forked from lijiext/lammps
817 lines
32 KiB
Plaintext
817 lines
32 KiB
Plaintext
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
|
|
Original Version:
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
|
|
-----------------------------------------------------------------------
|
|
|
|
USER-CUDA Package and associated modifications:
|
|
https://sourceforge.net/projects/lammpscuda/
|
|
|
|
Christian Trott, christian.trott@tu-ilmenau.de
|
|
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
|
Theoretical Physics II, University of Technology Ilmenau, Germany
|
|
|
|
See the README file in the USER-CUDA directory.
|
|
|
|
This software is distributed under the GNU General Public License.
|
|
------------------------------------------------------------------------- */
|
|
|
|
#define OFFSET 4096
|
|
__device__ int negativCUDA(float f)
|
|
{
|
|
return ((unsigned int)1<<31&(__float_as_int(f)))>>31;
|
|
}
|
|
|
|
__device__ void reduceBlock(float* data)
|
|
{
|
|
int p2=1;
|
|
while(p2*2<blockDim.x) p2*=2;
|
|
|
|
if(threadIdx.x<blockDim.x-p2)
|
|
data[threadIdx.x]+=data[threadIdx.x+p2];
|
|
__syncthreads();
|
|
for(int i=2;i<=p2;i*=2)
|
|
{
|
|
if(threadIdx.x<p2/i)
|
|
data[threadIdx.x]+=data[threadIdx.x+p2/i];
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
__device__ void reduceBlock(double* data)
|
|
{
|
|
int p2=1;
|
|
while(p2*2<blockDim.x) p2*=2;
|
|
|
|
if(threadIdx.x<blockDim.x-p2)
|
|
data[threadIdx.x]+=data[threadIdx.x+p2];
|
|
__syncthreads();
|
|
for(int i=2;i<=p2;i*=2)
|
|
{
|
|
if(threadIdx.x<p2/i)
|
|
data[threadIdx.x]+=data[threadIdx.x+p2/i];
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
extern __shared__ PPPM_FLOAT sharedmem[];
|
|
|
|
__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald)
|
|
{
|
|
PPPM_FLOAT my_fkx=unitkx*(int(threadIdx.x) - nx_pppm*(2*int(threadIdx.x)/nx_pppm));
|
|
PPPM_FLOAT my_fky=unitky*(int(blockIdx.y) - ny_pppm*(2*int(blockIdx.y)/ny_pppm));
|
|
PPPM_FLOAT my_fkz=unitkz*(int(blockIdx.x) - nz_pppm*(2*int(blockIdx.x)/nz_pppm));
|
|
|
|
if((blockIdx.x==0)&&(blockIdx.y==0)) fkx[threadIdx.x]=my_fkx;
|
|
if((blockIdx.x==0)&&(threadIdx.x==0)) fky[blockIdx.y]=my_fky;
|
|
if((threadIdx.x==0)&&(blockIdx.y==0)) fkz[blockIdx.x]=my_fkz;
|
|
__syncthreads();
|
|
if((blockIdx.x >= nzlo_fft)&&(blockIdx.x <=nzhi_fft)&&
|
|
(blockIdx.y >= nylo_fft)&&(blockIdx.y <=nyhi_fft)&&
|
|
(threadIdx.x>= nxlo_fft)&&(threadIdx.x<=nxhi_fft))
|
|
{
|
|
int n=((int(blockIdx.x)-nzlo_fft)*(nyhi_fft-nylo_fft+1)+int(blockIdx.y)-nylo_fft)*(nxhi_fft-nxlo_fft+1)+int(threadIdx.x)-nxlo_fft;
|
|
PPPM_FLOAT sqk = my_fkx*my_fkx + my_fky*my_fky + my_fkz*my_fkz;
|
|
PPPM_FLOAT vterm = (sqk==PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0)/sqk + PPPM_F(0.25)/(g_ewald*g_ewald));
|
|
vg[6*n+0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx*my_fkx;
|
|
vg[6*n+1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky*my_fky;
|
|
vg[6*n+2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz*my_fkz;
|
|
vg[6*n+3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fky;
|
|
vg[6*n+4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fkz;
|
|
vg[6*n+5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky*my_fkz;
|
|
|
|
}
|
|
}
|
|
|
|
__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
|
|
{
|
|
PPPM_FLOAT sx,sy,sz;
|
|
sz = sy = sx = PPPM_F(0.0);
|
|
for (int l = order-1; l >= 0; l--) {
|
|
sx = gf_b[l] + sx*x;
|
|
sy = gf_b[l] + sy*y;
|
|
sz = gf_b[l] + sz*z;
|
|
}
|
|
PPPM_FLOAT s = sx*sy*sz;
|
|
return s*s;
|
|
}
|
|
|
|
__global__ void setup_greensfn(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald,
|
|
int nbx,int nby,int nbz,
|
|
PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab)
|
|
{
|
|
PPPM_FLOAT sqk;
|
|
int nx,ny,nz,kper,lper,mper,k,l,m;
|
|
PPPM_FLOAT snx,sny,snz,snx2,sny2,snz2;
|
|
PPPM_FLOAT argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
|
|
PPPM_FLOAT sum1,dot1,dot2;
|
|
PPPM_FLOAT numerator,denominator;
|
|
|
|
PPPM_FLOAT form=PPPM_F(1.0);
|
|
int n=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
m=blockIdx.x;
|
|
l=blockIdx.y;
|
|
k=threadIdx.x;
|
|
|
|
mper = m - nz_pppm*(2*m/nz_pppm);
|
|
snz = sin(PPPM_F(0.5)*unitkz*mper*zprd_slab/nz_pppm);
|
|
snz2 = snz*snz;
|
|
|
|
|
|
lper = l - ny_pppm*(2*l/ny_pppm);
|
|
sny = sin(PPPM_F(0.5)*unitky*lper*yprd/ny_pppm);
|
|
sny2 = sny*sny;
|
|
|
|
kper = k - nx_pppm*(2*k/nx_pppm);
|
|
snx = sin(PPPM_F(0.5)*unitkx*kper*xprd/nx_pppm);
|
|
snx2 = snx*snx;
|
|
|
|
sqk = pow(unitkx*kper,PPPM_F(2.0)) + pow(unitky*lper,PPPM_F(2.0)) +
|
|
pow(unitkz*mper,PPPM_F(2.0));
|
|
|
|
if (sqk != PPPM_F(0.0)) {
|
|
numerator = form*PPPM_F(12.5663706)/sqk;
|
|
denominator = gf_denom(snx2,sny2,snz2);
|
|
sum1 = PPPM_F(0.0);
|
|
for (nx = -nbx; nx <= nbx; nx++) {
|
|
qx = unitkx*(kper+nx_pppm*nx);
|
|
sx = exp(PPPM_F(-.25)*pow(qx/g_ewald,PPPM_F(2.0)));
|
|
wx = PPPM_F(1.0);
|
|
argx = PPPM_F(0.5)*qx*xprd/nx_pppm;
|
|
if (argx != PPPM_F(0.0)) wx = pow(sin(argx)/argx,order);
|
|
for (ny = -nby; ny <= nby; ny++) {
|
|
qy = unitky*(lper+ny_pppm*ny);
|
|
sy = exp(PPPM_F(-.25)*pow(qy/g_ewald,PPPM_F(2.0)));
|
|
wy = PPPM_F(1.0);
|
|
argy = PPPM_F(0.5)*qy*yprd/ny_pppm;
|
|
if (argy != PPPM_F(0.0)) wy = pow(sin(argy)/argy,order);
|
|
for (nz = -nbz; nz <= nbz; nz++) {
|
|
qz = unitkz*(mper+nz_pppm*nz);
|
|
sz = exp(PPPM_F(-.25)*pow(qz/g_ewald,PPPM_F(2.0)));
|
|
wz = PPPM_F(1.0);
|
|
argz = PPPM_F(0.5)*qz*zprd_slab/nz_pppm;
|
|
if (argz != PPPM_F(0.0)) wz = pow(sin(argz)/argz,order);
|
|
|
|
dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
|
|
dot2 = qx*qx+qy*qy+qz*qz;
|
|
sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,PPPM_F(2.0));
|
|
}
|
|
}
|
|
}
|
|
greensfn[n] = numerator*sum1/denominator;
|
|
} else greensfn[n] = PPPM_F(0.0);
|
|
}
|
|
|
|
__global__ void poisson_scale_kernel()
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
FFT_FLOAT scaleinv=FFT_F(1.0)/(gridDim.x*gridDim.y*blockDim.x);
|
|
work1[2*i] *= scaleinv * greensfn[i];
|
|
work1[2*i+1] *= scaleinv * greensfn[i];
|
|
}
|
|
|
|
__global__ void poisson_xgrad_kernel()
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
work2[2*i] = fkx[threadIdx.x] * work1[2*i+1];
|
|
work2[2*i+1] = -fkx[threadIdx.x] * work1[2*i];
|
|
}
|
|
|
|
__global__ void poisson_ygrad_kernel()
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
work2[2*i] = fky[blockIdx.y] * work1[2*i+1];
|
|
work2[2*i+1] = -fky[blockIdx.y] * work1[2*i];
|
|
}
|
|
|
|
__global__ void poisson_zgrad_kernel()
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
work2[2*i] = fkz[blockIdx.x] * work1[2*i+1];
|
|
work2[2*i+1] = -fkz[blockIdx.x] * work1[2*i];
|
|
}
|
|
|
|
__global__ void poisson_vdx_brick_kernel(int ilo,int jlo,int klo)
|
|
{
|
|
int k=blockIdx.x+klo;
|
|
k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1));
|
|
int j=blockIdx.y+jlo;
|
|
j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1));
|
|
int i=threadIdx.x+ilo;
|
|
i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1));
|
|
vdx_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)];
|
|
}
|
|
|
|
__global__ void poisson_vdy_brick_kernel(int ilo,int jlo,int klo)
|
|
{
|
|
int k=blockIdx.x+klo;
|
|
k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1));
|
|
int j=blockIdx.y+jlo;
|
|
j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1));
|
|
int i=threadIdx.x+ilo;
|
|
i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1));
|
|
vdy_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)];
|
|
}
|
|
|
|
__global__ void poisson_vdz_brick_kernel(int ilo,int jlo,int klo)
|
|
{
|
|
int k=blockIdx.x+klo;
|
|
k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1));
|
|
int j=blockIdx.y+jlo;
|
|
j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1));
|
|
int i=threadIdx.x+ilo;
|
|
i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1));
|
|
vdz_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)];
|
|
}
|
|
|
|
__global__ void poisson_energy_kernel(int nxlo_fft,int nylo_fft,int nzlo_fft,int vflag)
|
|
{
|
|
ENERGY_FLOAT scaleinv=FFT_F(1.0)/(nx_pppm*ny_pppm*nz_pppm);
|
|
int i=(blockIdx.x+nzlo_fft)*ny_pppm*nx_pppm+(blockIdx.y+nylo_fft)*nx_pppm+threadIdx.x+nxlo_fft;
|
|
ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem;
|
|
ENERGY_FLOAT myenergy= scaleinv*scaleinv * greensfn[i] * (work1[2*i]*work1[2*i] + work1[2*i+1]*work1[2*i+1]);
|
|
s_energy[threadIdx.x]=myenergy;
|
|
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
energy[blockIdx.x*ny_pppm+blockIdx.y]=s_energy[0];
|
|
if(vflag)
|
|
{
|
|
__syncthreads();
|
|
for (int j = 0; j < 6; j++)
|
|
{
|
|
s_energy[threadIdx.x]= myenergy*vg[((blockIdx.x*gridDim.y+blockIdx.y)*(blockDim.x)+threadIdx.x)*6+j];
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
virial[blockIdx.x*ny_pppm+blockIdx.y+j*nz_pppm*ny_pppm]=s_energy[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__global__ void sum_energy_kernel1(int vflag)
|
|
{
|
|
ENERGY_FLOAT myenergy=energy[(blockIdx.x*ny_pppm+threadIdx.x)];
|
|
ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem;
|
|
s_energy[threadIdx.x]=myenergy;
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
energy[blockIdx.x*ny_pppm]=s_energy[0];
|
|
if(vflag)
|
|
{
|
|
__syncthreads();
|
|
for (int j = 0; j < 6; j++)
|
|
{
|
|
myenergy=virial[blockIdx.x*ny_pppm+threadIdx.x+j*ny_pppm*nz_pppm];
|
|
s_energy[threadIdx.x]=myenergy;
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
virial[blockIdx.x*ny_pppm+j*ny_pppm*nz_pppm]=s_energy[0];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
__global__ void sum_energy_kernel2(int vflag)
|
|
{
|
|
ENERGY_FLOAT myenergy=energy[threadIdx.x*ny_pppm];
|
|
ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem;
|
|
s_energy[threadIdx.x]=myenergy;
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
energy[0]=s_energy[0];
|
|
if(vflag)
|
|
{
|
|
__syncthreads();
|
|
for (int j = 0; j < 6; j++)
|
|
{
|
|
myenergy=virial[threadIdx.x*ny_pppm+j*ny_pppm*nz_pppm];
|
|
s_energy[threadIdx.x]=myenergy;
|
|
__syncthreads();
|
|
reduceBlock(s_energy);
|
|
if(threadIdx.x==0)
|
|
virial[j]=s_energy[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__ PPPM_FLOAT rho1d(int k,PPPM_FLOAT d,PPPM_FLOAT* srho_coeff)
|
|
{
|
|
PPPM_FLOAT rho1d_tmp=PPPM_F(0.0);
|
|
for (int l = order-1; l >= 0; l--)
|
|
rho1d_tmp = srho_coeff[l*order+k-(1-order)/2] + rho1d_tmp*d;
|
|
return rho1d_tmp;
|
|
}
|
|
|
|
__global__ void particle_map_kernel(int* flag)
|
|
{
|
|
int i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
if(i<nlocal)
|
|
{
|
|
int nx,ny,nz;
|
|
PPPM_FLOAT shift=PPPM_F(0.5)-shiftone;//+OFFSET;
|
|
nx = (int)((_x[i]-_boxlo[0])*delxinv+shift);// - OFFSET;
|
|
ny = (int)((_x[i+nmax]-_boxlo[1])*delyinv+shift);// - OFFSET;
|
|
nz = (int)((_x[i+2*nmax]-_boxlo[2])*delzinv+shift);// - OFFSET;
|
|
|
|
part2grid[i] = nx;
|
|
part2grid[i+nmax] = ny;
|
|
part2grid[i+2*nmax] = nz;
|
|
// check that entire stencil around nx,ny,nz will fit in my 3d brick
|
|
if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
|
|
ny+nlower < nylo_out || ny+nupper > nyhi_out ||
|
|
nz+nlower < nzlo_out || nz+nupper > nzhi_out)
|
|
{flag[0]++;
|
|
debugdata[0]=i;
|
|
debugdata[1]=_boxlo[0];
|
|
debugdata[2]=_boxlo[1];
|
|
debugdata[3]=_boxlo[2];
|
|
debugdata[4]=nx;
|
|
debugdata[5]=ny;
|
|
debugdata[6]=nz;
|
|
debugdata[7]=_x[i];
|
|
debugdata[8]=_x[i+_nmax];
|
|
debugdata[9]=_x[i+2*_nmax];
|
|
debugdata[10]=nlocal;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void make_rho_kernelA()
|
|
{
|
|
int i,l,m,n,nx,ny,nz,mx,my,mz;
|
|
|
|
// clear 3d density array
|
|
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
|
|
i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
|
|
if(i < nlocal) {
|
|
|
|
PPPM_FLOAT dx,dy,dz,x0,y0,z0;
|
|
nx = part2grid[i];
|
|
ny = part2grid[i+nmax];
|
|
nz = part2grid[i+2*nmax];
|
|
dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv;
|
|
dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv;
|
|
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv;
|
|
|
|
z0 = delxinv*delyinv*delzinv * _q[i];
|
|
for (n = nlower; n <= nupper; n++)
|
|
{
|
|
mz = n+nz;
|
|
y0 = z0*rho1d(n,dz,rho_coeff);
|
|
for (m = nlower; m <= nupper; m++)
|
|
{
|
|
my = m+ny;
|
|
x0 = y0*rho1d(m,dy,rho_coeff);
|
|
for (l = nlower; l <= nupper; l++)
|
|
{
|
|
mx = l+nx;
|
|
int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out;
|
|
while(atomicAdd(&density_brick_int[mzyx],1)!=0) atomicAdd(&density_brick_int[mzyx],-1);
|
|
density_brick[mzyx]+=x0*rho1d(l,dx,rho_coeff);
|
|
__threadfence();
|
|
atomicAdd(&density_brick_int[mzyx],-1);
|
|
__syncthreads();
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void make_rho_kernel(int* flag,int read_threads_at_same_time)
|
|
{
|
|
int i,l,m,n,nx,ny,nz,mx,my,mz,a,b;
|
|
|
|
// clear 3d density array
|
|
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// int nzxy=blockIdx.x*gridDim.y+blockIdx.y;
|
|
|
|
int nelements=nupper-nlower+1;
|
|
int* idx=(int*) sharedmem;
|
|
int* sdensity_brick_int=&idx[blockDim.x];
|
|
PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &sdensity_brick_int[nelements*blockDim.x];
|
|
if(threadIdx.x<order*(order/2-(1-order)/2+1))
|
|
srho_coeff[threadIdx.x]=rho_coeff[threadIdx.x];
|
|
__syncthreads();
|
|
|
|
i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
|
|
if(false)
|
|
{
|
|
if(i < nlocal) {
|
|
|
|
PPPM_FLOAT dx,dy,dz,x0,y0,z0;
|
|
nx = part2grid[i];
|
|
ny = part2grid[i+nmax];
|
|
nz = part2grid[i+2*nmax];
|
|
dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv;
|
|
dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv;
|
|
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv;
|
|
|
|
z0 = delxinv*delyinv*delzinv * _q[i];
|
|
for (n = nlower; n <= nupper; n++)
|
|
{
|
|
mz = n+nz;
|
|
y0 = z0*rho1d(n,dz,srho_coeff);
|
|
for (m = nlower; m <= nupper; m++)
|
|
{
|
|
my = m+ny;
|
|
x0 = y0*rho1d(m,dy,srho_coeff);
|
|
for (l = nlower; l <= nupper; l++)
|
|
{
|
|
mx = l+nx;
|
|
int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out;
|
|
|
|
a=int(x0*rho1d(l,dx,srho_coeff)*density_intScale);
|
|
b=(atomicAdd(&density_brick_int[mzyx],a)|a);
|
|
if(((b)&(0x7c000000))&&(not((b)&(0x80000000))))
|
|
{
|
|
flag[1]++;
|
|
if((b)&(0x60000000)) flag[0]++;
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
{
|
|
|
|
PPPM_FLOAT dx,dy,dz,x0,y0,z0,qtmp;
|
|
if(i < nlocal)
|
|
{
|
|
qtmp= _q[i];
|
|
nx = part2grid[i];
|
|
ny = part2grid[i+nmax];
|
|
nz = part2grid[i+2*nmax];
|
|
dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv;
|
|
dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv;
|
|
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv;
|
|
z0 = delxinv*delyinv*delzinv * qtmp;
|
|
}
|
|
else
|
|
{
|
|
nx=ny=nz=1; dx=dy=dz=PPPM_F(0.1);
|
|
}
|
|
__syncthreads();
|
|
for (n = nlower; n <= nupper; n++)
|
|
{
|
|
mz = n+nz;
|
|
y0 = z0*rho1d(n,dz,srho_coeff);
|
|
for (m = nlower; m <= nupper; m++)
|
|
{
|
|
my = m+ny;
|
|
x0 = y0*rho1d(m,dy,srho_coeff);
|
|
if(i<nlocal)
|
|
{
|
|
idx[threadIdx.x]=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+nx+nlower-nxlo_out;
|
|
for (l = nlower; l <= nupper; l++)
|
|
{
|
|
sdensity_brick_int[threadIdx.x*nelements+l-nlower]=int(x0*rho1d(l,dx,srho_coeff)*density_intScale);
|
|
}
|
|
}
|
|
else idx[threadIdx.x]=-1;
|
|
__syncthreads();
|
|
|
|
for(int ii=0;ii<blockDim.x;ii+=read_threads_at_same_time)
|
|
{
|
|
int kk=threadIdx.x/nelements;
|
|
|
|
if((threadIdx.x<nelements*read_threads_at_same_time)&&(kk+ii<blockDim.x)&&(idx[ii+kk]>-1))
|
|
{
|
|
a=sdensity_brick_int[ii*nelements+threadIdx.x];
|
|
//if(a*a>1e-100)
|
|
b=(atomicAdd(&density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements],a)|a);
|
|
//else
|
|
//b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a);
|
|
if(((b)&(0x7c000000))&&(not((b)&(0x80000000))))
|
|
{
|
|
flag[1]++;
|
|
if((b)&(0x60000000)) flag[0]++;
|
|
}
|
|
}
|
|
}
|
|
__syncthreads(); //*/
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
__global__ void scale_rho_kernel()
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
density_brick[i]=(1.0/density_intScale)*density_brick_int[i];
|
|
}
|
|
|
|
__global__ void fieldforce_kernel(int elements_per_thread,int read_threads_at_same_time,int* flag) //20*x64 0.36
|
|
{
|
|
int i;
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of E-field on particle
|
|
i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
int* idx=(int*) sharedmem;
|
|
PPPM_FLOAT* tmp_brick=(PPPM_FLOAT*) &idx[blockDim.x];
|
|
PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &tmp_brick[3*blockDim.x*elements_per_thread];
|
|
if(threadIdx.x<order*(order/2-(1-order)/2+1))
|
|
srho_coeff[threadIdx.x]=rho_coeff[threadIdx.x];
|
|
__syncthreads();
|
|
{
|
|
int l,m,n,nx,ny,nz,my,mz;
|
|
PPPM_FLOAT dx,dy,dz,x0,y0,z0;
|
|
PPPM_FLOAT ek[3];
|
|
if(i<nlocal)
|
|
{
|
|
nx = part2grid[i];
|
|
ny = part2grid[i+nmax];
|
|
nz = part2grid[i+2*nmax];
|
|
dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv;
|
|
dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv;
|
|
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv;
|
|
|
|
ek[0] = ek[1] = ek[2] = PPPM_F(0.0);
|
|
}
|
|
else
|
|
{
|
|
nx=ny=nz=1; dx=dy=dz=PPPM_F(0.1);
|
|
}
|
|
__syncthreads();
|
|
for (n = nlower; n <= nupper; n++)
|
|
{
|
|
mz = n+nz;
|
|
z0 = rho1d(n,dz,srho_coeff);
|
|
for (m = nlower; m <= nupper; m++)
|
|
{
|
|
my = m+ny;
|
|
y0 = z0*rho1d(m,dy,srho_coeff);
|
|
|
|
|
|
if(i<nlocal)
|
|
idx[threadIdx.x]=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+nx+nlower-nxlo_out;
|
|
else idx[threadIdx.x]=-1;
|
|
__syncthreads();
|
|
|
|
for(int ii=0;ii<blockDim.x;ii+=read_threads_at_same_time)
|
|
{
|
|
int kk=threadIdx.x/elements_per_thread;
|
|
|
|
if((threadIdx.x<elements_per_thread*read_threads_at_same_time)&&(kk+ii<blockDim.x)&&(idx[ii+kk]>-1))
|
|
{
|
|
tmp_brick[ii*elements_per_thread+threadIdx.x]=vdx_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread];
|
|
tmp_brick[(ii+blockDim.x)*elements_per_thread+threadIdx.x]=vdy_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread];
|
|
tmp_brick[(ii+2*blockDim.x)*elements_per_thread+threadIdx.x]=vdz_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread];
|
|
}
|
|
}
|
|
__syncthreads();
|
|
|
|
if(i<nlocal)
|
|
for (l = nlower; l <= nupper; l++)
|
|
{
|
|
x0 = y0*rho1d(l,dx,srho_coeff);
|
|
|
|
ek[0] -= x0*tmp_brick[threadIdx.x*elements_per_thread+l-nlower];
|
|
ek[1] -= x0*tmp_brick[threadIdx.x*elements_per_thread+l-nlower+blockDim.x*elements_per_thread];
|
|
ek[2] -= x0*tmp_brick[threadIdx.x*elements_per_thread+l-nlower+2*blockDim.x*elements_per_thread];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
// convert E-field to force
|
|
|
|
|
|
_f[i] += qqrd2e*_q[i]*ek[0];
|
|
_f[i+nmax] += qqrd2e*_q[i]*ek[1];
|
|
_f[i+2*nmax] += qqrd2e*_q[i]*ek[2];
|
|
}
|
|
}
|
|
|
|
__global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
|
|
{
|
|
ENERGY_FLOAT* dipole =(ENERGY_FLOAT*) sharedmem;
|
|
int i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
if(i<nlocal)
|
|
dipole[threadIdx.x]=_q[i]*_x[i+2*nmax];
|
|
else
|
|
dipole[threadIdx.x]=ENERGY_F(0.0);
|
|
__syncthreads();
|
|
reduceBlock(dipole);
|
|
if(threadIdx.x==0) buf[blockIdx.x]=dipole[0];
|
|
}
|
|
|
|
__global__ void slabcorr_force_kernel(F_FLOAT ffact)
|
|
{
|
|
int i=blockIdx.x*blockDim.x+threadIdx.x;
|
|
if(i<nlocal)
|
|
_f[i+2*nmax] += qqrd2e*_q[i]*ffact;
|
|
}
|
|
|
|
|
|
__global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)+1]=0;
|
|
}
|
|
|
|
__global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(slabflag){
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
out[2*(((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1)+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}else{
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
out[2*(((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1)+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
out[2*((((blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+(nzhi_out-nzlo_in))*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}
|
|
|
|
__global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+(2*(nyhi_in+1)-nylo_in-nyhi_out)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+(nyhi_out-nylo_in))*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}
|
|
|
|
__global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}
|
|
|
|
__global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(slabflag){
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}else{
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxlo_in-nxlo_out];
|
|
}
|
|
|
|
__global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(slabflag)
|
|
{
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}else{
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nylo_in-nylo_out)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}
|
|
}
|
|
|
|
__global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzlo_in-nzlo_out)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}
|
|
|
|
__global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
|
|
{
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzhi_out-nzhi_in)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*(((blockIdx.x*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x+nzhi_in-nzlo_out+1)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(slabflag){
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((nzhi_in-nzlo_in+2-nupper-slabflag+blockIdx.x)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}else{
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nyhi_out-nyhi_in)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y+nyhi_in-nylo_out+1)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxlo_in-nxlo_out)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x+2*(nxhi_in+1)-nxlo_in-nxhi_out)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x];
|
|
|
|
if(blockIdx.x<nzlo_in-nzlo_out)
|
|
if(blockIdx.y<nylo_in-nylo_out)
|
|
if(threadIdx.x<nxhi_out-nxhi_in)
|
|
out[2*((((blockIdx.x+2*(nzhi_in+1)-nzlo_in-nzhi_out)*(nyhi_in-nylo_in+1)+blockIdx.y+2*(nyhi_in+1)-nylo_in-nyhi_out)*(nxhi_in-nxlo_in+1))+threadIdx.x)]+=in[(((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1))+threadIdx.x+nxhi_in-nxlo_out+1];
|
|
}
|
|
}
|