lammps/lib/cuda/fft3d_cuda.cu

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov 

   See the README file in the top-level LAMMPS directory. 

   ----------------------------------------------------------------------- 

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/ 

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany 

   See the README file in the USER-CUDA directory. 

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"
// Pair of FFT_FLOAT values named re and im (presumably the real and imaginary
// parts of one complex FFT element -- FFT_FLOAT itself is defined outside this file).
// The launch wrappers in this file cast FFT_DATA* to FFT_FLOAT* before handing
// buffers to the kernels, so the field order and the absence of extra members
// are part of the contract: do not reorder or pad this struct.
struct  FFT_DATA{
  FFT_FLOAT re;  // first FFT_FLOAT of the pair
  FFT_FLOAT im;  // second FFT_FLOAT of the pair
};

#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>

/* Host-side launcher for initfftdata_kernel.
 *
 *   in, out  buffers forwarded unchanged to the kernel
 *   nfast    number of threads per block (blockDim.x)
 *   nmid     number of blocks along grid y
 *   nslow    number of blocks along grid x
 *
 * The device is synchronised before and after the launch, so this call
 * does not return until the kernel has finished. The CUDA error state is
 * only reported through MYDBG (defined outside this file).
 */
void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow)
{
  const dim3 blocks(nslow,nmid,1);
  const dim3 block_threads(nfast,1,1);

  // Wait for any outstanding device work before the kernel is started.
  cudaThreadSynchronize();
  initfftdata_kernel<<<blocks,block_threads,0>>>(in,out);
  // Block until the kernel has completed.
  cudaThreadSynchronize();

  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}


/* Host-side launcher for permute_kernel.
 *
 *   in, out  FFT_DATA buffers, passed to the kernel reinterpreted as FFT_FLOAT*
 *   nfast    each block gets 2*nfast threads (presumably one thread per
 *            re/im component -- confirm against the kernel)
 *   nmid     number of blocks along grid y
 *   nslow    number of blocks along grid x
 *
 * Synchronises the device after the launch; the CUDA error state is only
 * reported through MYDBG (defined outside this file).
 */
void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
  const dim3 blocks(nslow,nmid,1);
  const dim3 block_threads(nfast*2,1,1);

  FFT_FLOAT* const src=(FFT_FLOAT*)in;
  FFT_FLOAT* const dst=(FFT_FLOAT*)out;

  permute_kernel<<<blocks,block_threads,0>>>(src,dst);
  // Block until the kernel has completed.
  cudaThreadSynchronize();

  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}

/* Host-side launcher used for the "permute and scale" step.
 *
 *   in, out  FFT_DATA buffers, passed to the kernel reinterpreted as FFT_FLOAT*
 *   nfast    each block gets 2*nfast threads
 *   nmid     number of blocks along grid y
 *   nslow    number of blocks along grid x
 *
 * Synchronises the device after the launch. The CUDA error state is now
 * reported through MYDBG, matching initfftdata() and permute(); previously a
 * failed launch here was never reported.
 */
void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
  dim3 grid(nslow,nmid,1);
  dim3 threads(nfast*2,1,1);

  // NOTE(review): this launches permute_kernel, exactly like permute(); no
  // scaling is applied anywhere in this wrapper. Confirm whether a dedicated
  // scaling kernel was intended or whether scaling happens elsewhere.
  permute_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out);
  // Block until the kernel has completed.
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_kernel (permute_scale): %s\n",cudaGetErrorString(cudaGetLastError())));
}
/* Host-side launcher for permute_part_kernel on an index sub-range.
 *
 *   in, out            FFT_DATA buffers, passed to the kernel as FFT_FLOAT*
 *   nfast, nmid, nslow forwarded unchanged to the kernel
 *   ihi/ilo            inclusive bounds; (ihi-ilo+1) blocks along grid x
 *   jhi/jlo            inclusive bounds; (jhi-jlo+1) blocks along grid y
 *   khi/klo            inclusive bounds; 2*(khi-klo+1) threads per block
 *
 * If any of the three ranges is empty (hi < lo) there is nothing to launch:
 * the resulting zero or wrapped-around unsigned dimension would be an invalid
 * launch configuration, so the kernel launch is skipped. The device is
 * synchronised in every case, so callers that rely on this function as a
 * synchronisation point keep that behaviour.
 *
 * NOTE(review): the block size 2*(khi-klo+1) is not checked against the
 * device's threads-per-block limit; an over-large k range still fails at
 * launch and is only visible through the MYDBG report below.
 */
void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)
{
  const int ni=ihi-ilo+1;
  const int nj=jhi-jlo+1;
  const int nk=khi-klo+1;

  // Only launch when every extent is positive; see header comment.
  if(ni>0 && nj>0 && nk>0)
  {
    dim3 grid(ni,nj,1);
    dim3 threads(nk*2,1,1);
    permute_part_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo);
  }

  // Block until all outstanding device work (including the kernel) is done.
  cudaThreadSynchronize();
  // Same debug-only error report as initfftdata() and permute().
  MYDBG(printf("ERROR-CUDA permute_part_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}
 
 /* Blocks the calling host thread until the device has finished all
  * previously issued work. The status returned by the runtime is discarded. */
 void FFTsyncthreads()
 {
   (void)cudaThreadSynchronize();
 }
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6262 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2011-06-01 05:09:51 +08:00			`/* ----------------------------------------------------------------------`
			`LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator`

			`Original Version:`
			`http://lammps.sandia.gov, Sandia National Laboratories`
			`Steve Plimpton, sjplimp@sandia.gov`

			`See the README file in the top-level LAMMPS directory.`

			`-----------------------------------------------------------------------`

			`USER-CUDA Package and associated modifications:`
			`https://sourceforge.net/projects/lammpscuda/`

			`Christian Trott, christian.trott@tu-ilmenau.de`
			`Lars Winterfeld, lars.winterfeld@tu-ilmenau.de`
			`Theoretical Physics II, University of Technology Ilmenau, Germany`

			`See the README file in the USER-CUDA directory.`

			`This software is distributed under the GNU General Public License.`
			`------------------------------------------------------------------------- */`

			`//#define CUDA_PRECISION 1`
			`#include "cuda_precision.h"`
			`#include "cuda_common.h"`
			`struct FFT_DATA{`
			`FFT_FLOAT re;`
			`FFT_FLOAT im;`
			`};`

			`#include "fft3d_cuda_cu.h"`
			`#include "fft3d_cuda_kernel.cu"`
			`#include <stdio.h>`

			`void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow)`
			`{`

			`dim3 grid;`
			`grid.x=nslow;`
			`grid.y=nmid;`
			`grid.z=1;`
			`dim3 threads;`
			`threads.x=nfast;`
			`threads.y=1;`
			`threads.z=1;`
			`cudaThreadSynchronize();`
			`initfftdata_kernel<<<grid,threads,0>>>(in,out);`
			`cudaThreadSynchronize();`
			`MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));`
			`}`


			`void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)`
			`{`

			`dim3 grid;`
			`grid.x=nslow;`
			`grid.y=nmid;`
			`grid.z=1;`
			`dim3 threads;`
			`threads.x=nfast*2;`
			`threads.y=1;`
			`threads.z=1;`
			`permute_kernel<<<grid,threads,0>>>((FFT_FLOAT)in,(FFT_FLOAT)out);`
			`cudaThreadSynchronize();`
			`MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));`
			`}`

			`void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)`
			`{`

			`dim3 grid;`
			`grid.x=nslow;`
			`grid.y=nmid;`
			`grid.z=1;`
			`dim3 threads;`
			`threads.x=nfast*2;`
			`threads.y=1;`
			`threads.z=1;`
			`permute_kernel<<<grid,threads,0>>>((FFT_FLOAT)in,(FFT_FLOAT)out);`
			`cudaThreadSynchronize();`
			`}`
			`void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)`
			`{`

			`dim3 grid;`
			`grid.x=(ihi-ilo+1);`
			`grid.y=(jhi-jlo+1);`
			`grid.z=1;`
			`dim3 threads;`
			`threads.x=(khi-klo+1)*2;`
			`threads.y=1;`
			`threads.z=1;`
			`permute_part_kernel<<<grid,threads,0>>>((FFT_FLOAT)in,(FFT_FLOAT)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo);`
			`cudaThreadSynchronize();`
			`}`

			`void FFTsyncthreads()`
			`{`
			`cudaThreadSynchronize();`
			`}`