/* File-viewer listing header (not part of the program):
   lammps/lib/cuda/binning.cu
   197 lines, 9.9 KiB, Plaintext */

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifdef CUDA_USE_BINNING
#include <stdio.h>
#define MY_PREFIX binning
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "binning_cu.h"
#include "binning_kernel.cu"
/* ----------------------------------------------------------------------
   Cuda_PreBinning

   Uploads the values the pre-binning kernel reads through device symbols
   and launches PreBinning_Kernel.

   sdata  shared host/device bookkeeping. Read here: atom.ntypes,
          atom.binned_type / binned_x / binned_tag (dim[0] and dev_data),
          domain.sublo, domain.subhi, domain.bin_dim, domain.bin_nmax.

   Launch layout: grid = (bin_dim[0], bin_dim[1] * bin_dim[2], 1),
                  block = (bin_nmax, 1, 1).
------------------------------------------------------------------------- */
void Cuda_PreBinning(cuda_shared_data* sdata)
{
  // The symbol uploads below run on EVERY call. The previous code wrapped
  // them in an "only on first call" guard whose flag was a plain local
  // (always 0 on entry), so the guard never skipped anything; the dead flag
  // is removed and the per-call behaviour is kept.
  int cuda_dummy_type = sdata->atom.ntypes + 1;

  // One thousandth of the sub-domain extent along each axis.
  X_FLOAT outside[3] =
  {
    (sdata->domain.subhi[0] - sdata->domain.sublo[0]) / 1000.0,
    (sdata->domain.subhi[1] - sdata->domain.sublo[1]) / 1000.0,
    (sdata->domain.subhi[2] - sdata->domain.sublo[2]) / 1000.0
  };

  cudaMemcpyToSymbol("binned_size_all"    , & sdata->atom.binned_type.dim[0]   , sizeof(unsigned));
  cudaMemcpyToSymbol("cuda_dummy_type"    , & cuda_dummy_type                  , sizeof(int));
  cudaMemcpyToSymbol("outside"            , & outside                          , sizeof(X_FLOAT) * 3);
  // For the *.dev_data entries the pointer VALUE is what gets uploaded,
  // hence the address-of and the pointer-sized byte count.
  cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data , sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(binned_x)   , & sdata->atom.binned_x   .dev_data , sizeof(X_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data , sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(subhi)      , sdata->domain.subhi                , sizeof(X_FLOAT) * 3);
  // bin_nmax == blockDim.x
  // printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type));
  // int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!!

  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1);
  dim3 threads(sdata->domain.bin_nmax, 1, 1);
  MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);)
  MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z);)

  PreBinning_Kernel<<<grid, threads>>> ();
  cudaThreadSynchronize();

  // Peek instead of Get: cudaGetLastError() resets the stored error, which
  // would hide a failure from the CUT_CHECK_ERROR that follows.
  MYDBG(printf("ERROR-CUDA pre_binning: %s\n", cudaGetErrorString(cudaPeekAtLastError()));)
  CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed");
}
/* ----------------------------------------------------------------------
   Cuda_Binning

   Uploads the values Binning_Kernel reads through device symbols, clears
   the per-bin counters and the error counter, then launches
   Binning_Kernel twice: once with offset 0 and a grid sized from nlocal,
   and once with offset nlocal and a grid sized from nall - nlocal.
   After each launch the device error counter is copied back and a
   message is printed if it is non-zero.

   sdata  shared host/device bookkeeping (atom.* arrays and counts,
          domain.sublo / subhi / bin_dim / bin_nmax are read here).

   Launch layout: 1D grid, 64 threads per block.
------------------------------------------------------------------------- */
void Cuda_Binning(cuda_shared_data* sdata)
{
  // check assumption in debug mode
  MYDBG(
    if(sdata->atom.x.dim[1] != 3)
    {
      printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n");
      return;
    }
  )

  // The symbol uploads below run on EVERY call. The previous "only on first
  // call" guard used a plain local flag that was always 0 on entry, so it
  // never skipped anything; per-call values such as nall / nlocal / nghost
  // are uploaded here, so the per-call behaviour is kept and the dead flag
  // is removed.
  X_FLOAT const_rez_bin_size[3] =
  {
    (1.0 * sdata->domain.bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
    (1.0 * sdata->domain.bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
    (1.0 * sdata->domain.bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
  };

  // FIX: a device POINTER is uploaded here, so the byte count must be the
  // pointer size. The old sizeof(X_FLOAT) copied only part of the pointer
  // whenever X_FLOAT is narrower than a pointer.
  cudaMemcpyToSymbol("bin_error_count"        , & sdata->atom.bin_error_count.dev_data, sizeof(int*));
  cudaMemcpyToSymbol("rez_bin_size"           , & const_rez_bin_size                  , sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_CONST(bin_count_all)  , & sdata->atom.bin_count_all  .dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_dim)        , sdata->domain.bin_dim                 , sizeof(int3));
  cudaMemcpyToSymbol(MY_CONST(bin_nmax)       , & sdata->domain.bin_nmax              , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(binned_f)       , & sdata->atom.binned_f       .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_q)       , & sdata->atom.binned_q       .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_rmass)   , & sdata->atom.binned_rmass   .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_tag)     , & sdata->atom.binned_tag     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(binned_type)    , & sdata->atom.binned_type    .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(binned_v)       , & sdata->atom.binned_v       .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binpos)         , & sdata->atom.binpos         .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(f)              , & sdata->atom.f              .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(natoms)         , & sdata->atom.nall                    , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(nghost)         , & sdata->atom.nghost                  , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(nlocal)         , & sdata->atom.nlocal                  , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(nmax)           , & sdata->atom.nmax                    , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(q)              , & sdata->atom.q              .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(rmass)          , & sdata->atom.rmass          .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(sublo)          , sdata->domain.sublo                   , sizeof(X_FLOAT) * 3);
  cudaMemcpyToSymbol(MY_CONST(tag)            , & sdata->atom.tag            .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(type)           , & sdata->atom.type           .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(v)              , & sdata->atom.v              .dev_data, sizeof(V_FLOAT*));

  // ---- pass 1: offset 0, grid sized from nlocal -------------------------
  dim3 grid((unsigned)(1 + sdata->atom.nlocal / 64.0), 1, 1);
  MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x, sdata->atom.nlocal); )
  dim3 threads(64, 1, 1);

  // Zero one int per bin in both counter arrays, and the single error counter.
  const size_t bin_counter_bytes = sizeof(int) * (sdata->domain.bin_dim[0]) * (sdata->domain.bin_dim[1]) * (sdata->domain.bin_dim[2]);
  cudaMemset((int*) (sdata->atom.bin_count_all.dev_data)  , 0, bin_counter_bytes);
  cudaMemset((int*) (sdata->atom.bin_count_local.dev_data), 0, bin_counter_bytes);
  cudaMemset(sdata->atom.bin_error_count.dev_data, 0, sizeof(int) * 1);

  // Zero-initialised so that a failed device-to-host copy cannot make the
  // checks below read an indeterminate value.
  int binning_error_l[1] = { 0 };

  Binning_Kernel<<<grid, threads>>> (
    (X_FLOAT*) (sdata->atom.x.dev_data),
    (X_FLOAT*) (sdata->atom.binned_x.dev_data),
    sdata->atom.q_flag,
    0,
    sdata->atom.rmass_flag
  );
  cudaThreadSynchronize();

  cudaMemcpy((void*) binning_error_l, (void*) sdata->atom.bin_error_count.dev_data, 1 * sizeof(int), cudaMemcpyDeviceToHost);

  if(binning_error_l[0] != 0)
  {
    printf("CUDA-ERROR: binning local: could not bin %i atoms\n", binning_error_l[0]);
  }

  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");

  // ---- pass 2: offset nlocal, grid sized from nall - nlocal -------------
  // NOTE(review): this grid is computed for 32 threads per block while the
  // launch still uses the 64-thread `threads` from pass 1, so roughly twice
  // as many blocks as needed are started. Left unchanged because the
  // kernel's bounds handling is not visible from this file -- confirm
  // before switching the divisor to 64.
  grid.x = (unsigned)(1 + (sdata->atom.nall - sdata->atom.nlocal) / 32.0);
  MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); )

  Binning_Kernel<<<grid, threads>>> (
    (X_FLOAT*) (sdata->atom.x.dev_data),
    (X_FLOAT*) (sdata->atom.binned_x.dev_data),
    sdata->atom.q_flag,
    sdata->atom.nlocal,
    sdata->atom.rmass_flag
  );
  cudaThreadSynchronize();

  // NOTE(review): the device error counter is not cleared between the two
  // passes, so the value read here presumably also includes whatever pass 1
  // reported -- confirm against the kernel.
  binning_error_l[0] = 0;
  cudaMemcpy((void*) binning_error_l, (void*) sdata->atom.bin_error_count.dev_data, 1 * sizeof(int), cudaMemcpyDeviceToHost);

  if(binning_error_l[0] != 0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n", binning_error_l[0]);

  // Same check as after pass 1; the second launch previously had none.
  CUT_CHECK_ERROR("Cuda_Binning: ghost binning Kernel execution failed");
}
/* ----------------------------------------------------------------------
   Cuda_ReverseBinning

   Uploads the values ReverseBinning_Kernel reads through device symbols
   and launches it with the unbinned and binned position arrays and
   atom.q_flag as arguments.

   sdata  shared host/device bookkeeping (atom.* arrays and counts and
          domain.bin_dim are read here).

   Launch layout: 1D grid of (unsigned)(1 + nlocal / 32.0) blocks,
                  32 threads per block.
------------------------------------------------------------------------- */
void Cuda_ReverseBinning(cuda_shared_data* sdata)
{
  // The symbol uploads below run on EVERY call. The previous "only on first
  // call" guard used a plain local flag that was always 0 on entry, so it
  // never skipped anything; the dead flag is removed and the per-call
  // behaviour is kept.
  cudaMemcpyToSymbol(MY_CONST(bin_count_all)  , & sdata->atom.bin_count_all  .dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
  cudaMemcpyToSymbol(MY_CONST(bin_dim)        , sdata->domain.bin_dim                 , sizeof(int3));
  cudaMemcpyToSymbol(MY_CONST(binned_f)       , & sdata->atom.binned_f       .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_q)       , & sdata->atom.binned_q       .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(binned_tag)     , & sdata->atom.binned_tag     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(binned_type)    , & sdata->atom.binned_type    .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(binned_v)       , & sdata->atom.binned_v       .dev_data, sizeof(V_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(f)              , & sdata->atom.f              .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(natoms)         , & sdata->atom.nall                    , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(nmax)           , & sdata->atom.nmax                    , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(q)              , & sdata->atom.q              .dev_data, sizeof(F_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(tag)            , & sdata->atom.tag            .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(type)           , & sdata->atom.type           .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_CONST(v)              , & sdata->atom.v              .dev_data, sizeof(V_FLOAT*));

  dim3 grid((unsigned)(1 + sdata->atom.nlocal / 32.0), 1, 1);
  MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x, sdata->atom.nlocal); )
  dim3 threads(32, 1, 1);

  ReverseBinning_Kernel<<<grid, threads>>> (
    (X_FLOAT*) (sdata->atom.x.dev_data),
    (X_FLOAT*) (sdata->atom.binned_x.dev_data),
    sdata->atom.q_flag
  );
  cudaThreadSynchronize();

  // Message now names this function; it previously said "Cuda_Binning",
  // which pointed at the wrong launcher when the check fired.
  CUT_CHECK_ERROR("Cuda_ReverseBinning: reverse binning Kernel execution failed");
}
#endif