forked from lijiext/lammps
104 lines
2.7 KiB
Plaintext
104 lines
2.7 KiB
Plaintext
|
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
|
||
|
|
||
|
//#define CUDA_PRECISION 1
|
||
|
#include "cuda_precision.h"
|
||
|
#include "cuda_common.h"
|
||
|
// One complex FFT sample stored as two consecutive FFT_FLOAT values.
// The permute wrappers in this file cast FFT_DATA* to FFT_FLOAT*, so the
// member order (re first, then im) is part of the memory layout they rely on.
struct FFT_DATA {
  FFT_FLOAT re;  // real component
  FFT_FLOAT im;  // imaginary component
};
|
||
|
|
||
|
#include "fft3d_cuda_cu.h"
|
||
|
#include "fft3d_cuda_kernel.cu"
|
||
|
#include <stdio.h>
|
||
|
|
||
|
// Host wrapper around initfftdata_kernel.
//
// Launches the kernel on an (nslow x nmid x 1) grid of blocks with nfast
// threads per block, forwarding `in` and `out` unchanged. The device is
// synchronized both before and after the launch, so the call is blocking.
//
//   in     forwarded to the kernel as its first argument
//   out    forwarded to the kernel as its second argument
//   nfast  threads per block (threads.x)
//   nmid   grid extent in y
//   nslow  grid extent in x
//
// NOTE(review): `in` and `out` are presumably device pointers, and nfast must
// fit within the device's per-block thread limit -- confirm against callers.
void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast, 1, 1);

  // cudaDeviceSynchronize() is the supported replacement for the deprecated
  // cudaThreadSynchronize(); the blocking semantics are the same.
  cudaDeviceSynchronize();
  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
  cudaDeviceSynchronize();

  // Debug builds report the status of the launch/synchronization.
  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||
|
|
||
|
|
||
|
// Host wrapper around permute_kernel.
//
// Reinterprets both complex arrays as flat FFT_FLOAT arrays and launches the
// kernel on an (nslow x nmid x 1) grid with 2 * nfast threads per block
// (one thread per FFT_FLOAT component along the fast axis). Blocks until the
// kernel has finished.
//
//   in     source array, passed to the kernel as FFT_FLOAT*
//   out    destination array, passed to the kernel as FFT_FLOAT*
//   nfast  fast-axis extent; threads.x = 2 * nfast
//   nmid   grid extent in y
//   nslow  grid extent in x
//
// NOTE(review): 2 * nfast must fit within the device's per-block thread
// limit -- confirm against callers.
void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);

  // cudaDeviceSynchronize() is the supported replacement for the deprecated
  // cudaThreadSynchronize(); the blocking semantics are the same.
  cudaDeviceSynchronize();

  // Debug builds report the status of the launch/synchronization.
  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||
|
|
||
|
// Host wrapper with the same launch configuration as permute(): an
// (nslow x nmid x 1) grid with 2 * nfast threads per block, with both arrays
// passed to permute_kernel as flat FFT_FLOAT arrays. Blocks until the kernel
// has finished.
//
//   in     source array, passed to the kernel as FFT_FLOAT*
//   out    destination array, passed to the kernel as FFT_FLOAT*
//   nfast  fast-axis extent; threads.x = 2 * nfast
//   nmid   grid extent in y
//   nslow  grid extent in x
//
// NOTE(review): despite the name, this launches the plain permute_kernel and
// no scale factor is passed anywhere -- confirm whether scaling is expected
// to happen elsewhere.
void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);

  // cudaDeviceSynchronize() is the supported replacement for the deprecated
  // cudaThreadSynchronize(); the blocking semantics are the same.
  cudaDeviceSynchronize();

  // Report launch/synchronization status in debug builds, matching the other
  // wrappers in this file.
  MYDBG(printf("ERROR-CUDA permute_scale (permute_kernel): %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||
|
// Host wrapper around permute_part_kernel for an inclusive index sub-range.
//
// The launch uses one block per (i, j) pair in [ilo, ihi] x [jlo, jhi] and
// 2 * (khi - klo + 1) threads per block. Both arrays are passed to the kernel
// as flat FFT_FLOAT arrays, together with the full extents and all six range
// bounds. Blocks until the kernel has finished.
//
//   in, out            source / destination arrays, passed as FFT_FLOAT*
//   nfast, nmid, nslow extents forwarded unchanged to the kernel
//   ilo..ihi           inclusive range that sets grid.x
//   jlo..jhi           inclusive range that sets grid.y
//   klo..khi           inclusive range that sets threads.x (times 2)
//
// NOTE(review): each hi bound is assumed to be >= its lo bound, and
// 2 * (khi - klo + 1) must fit within the device's per-block thread limit --
// confirm against callers.
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  dim3 grid(ihi - ilo + 1, jhi - jlo + 1, 1);
  dim3 threads((khi - klo + 1) * 2, 1, 1);

  permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);

  // cudaDeviceSynchronize() is the supported replacement for the deprecated
  // cudaThreadSynchronize(); the blocking semantics are the same.
  cudaDeviceSynchronize();

  // Report launch/synchronization status in debug builds, matching the other
  // wrappers in this file.
  MYDBG(printf("ERROR-CUDA permute_part_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||
|
|
||
|
// Blocks the calling host thread until all previously issued device work has
// completed. Uses cudaDeviceSynchronize(), the supported replacement for the
// deprecated cudaThreadSynchronize().
void FFTsyncthreads()
{
  cudaDeviceSynchronize();
}
|
||
|
|