forked from lijiext/lammps
338 lines
10 KiB
Plaintext
338 lines
10 KiB
Plaintext
|
/* ----------------------------------------------------------------------
|
||
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||
|
|
||
|
Original Version:
|
||
|
http://lammps.sandia.gov, Sandia National Laboratories
|
||
|
Steve Plimpton, sjplimp@sandia.gov
|
||
|
|
||
|
See the README file in the top-level LAMMPS directory.
|
||
|
|
||
|
-----------------------------------------------------------------------
|
||
|
|
||
|
USER-CUDA Package and associated modifications:
|
||
|
https://sourceforge.net/projects/lammpscuda/
|
||
|
|
||
|
Christian Trott, christian.trott@tu-ilmenau.de
|
||
|
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||
|
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||
|
|
||
|
See the README file in the USER-CUDA directory.
|
||
|
|
||
|
This software is distributed under the GNU General Public License.
|
||
|
------------------------------------------------------------------------- */
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include "cuda_shared.h"
|
||
|
#include "cuda_common.h"
|
||
|
#include "cuda_wrapper_cu.h"
|
||
|
#include "cuda_wrapper_kernel.cu"
|
||
|
|
||
|
static int CudaWrapper_total_gpu_mem = 0;
|
||
|
static double CudaWrapper_total_upload_time = 0;
|
||
|
static double CudaWrapper_total_download_time = 0;
|
||
|
static double CudaWrapper_cpubuffer_upload_time = 0;
|
||
|
static double CudaWrapper_cpubuffer_download_time = 0;
|
||
|
static cudaStream_t* streams;
|
||
|
static int nstreams = 0;
|
||
|
|
||
|
void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: debug mode on\n");)
|
||
|
|
||
|
#if __DEVICE_EMULATION__
|
||
|
|
||
|
printf("# CUDA: emulation mode on\n");
|
||
|
|
||
|
#else
|
||
|
|
||
|
// modified from cutil.h
|
||
|
static int deviceCount = 0;
|
||
|
static bool sharedmode = false;
|
||
|
|
||
|
if(deviceCount && !sharedmode) return;
|
||
|
|
||
|
if(deviceCount && sharedmode) cudaThreadExit();
|
||
|
|
||
|
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
|
||
|
|
||
|
if(deviceCount == 0) {
|
||
|
fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)
|
||
|
|
||
|
cudaDeviceProp deviceProp[deviceCount];
|
||
|
|
||
|
for(int i = 0; i < deviceCount; i++)
|
||
|
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));
|
||
|
|
||
|
|
||
|
int dev_list[deviceCount];
|
||
|
|
||
|
for(int i = 0; i < deviceCount; i++) dev_list[i] = i;
|
||
|
|
||
|
for(int i = 0; i < deviceCount; i++) {
|
||
|
for(int j = 0; j < deviceCount - 1 - i; j++)
|
||
|
if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
|
||
|
int k = dev_list[j];
|
||
|
dev_list[j] = dev_list[j + 1];
|
||
|
dev_list[j + 1] = k;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for(int i = 0; i < deviceCount; i++) {
|
||
|
if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true;
|
||
|
|
||
|
cudaSetDevice(i);
|
||
|
cudaSetDeviceFlags(cudaDeviceMapHost);
|
||
|
}
|
||
|
|
||
|
if(sharedmode) {
|
||
|
if(ppn && (me % ppn + 1) > deviceCount) {
|
||
|
printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n");
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
int devicea = me % ppn;
|
||
|
|
||
|
if(devicelist) devicea = devicelist[devicea];
|
||
|
else
|
||
|
devicea = dev_list[devicea];
|
||
|
|
||
|
if(devicea >= deviceCount) {
|
||
|
printf("Asking for non existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
MYDBG(
|
||
|
printf(" # CUDA myid: %i take device: %i\n", me, devicea);
|
||
|
)
|
||
|
CUDA_SAFE_CALL(cudaSetDevice(devicea));
|
||
|
} else {
|
||
|
CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
|
||
|
}
|
||
|
|
||
|
cudaThreadSynchronize();
|
||
|
|
||
|
int dev;
|
||
|
CUDA_SAFE_CALL(cudaGetDevice(&dev));
|
||
|
|
||
|
if(deviceProp[dev].major < 1) {
|
||
|
fprintf(stderr, "CUDA error: device does not support CUDA.\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
} else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
|
||
|
fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
|
||
|
fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor);
|
||
|
}
|
||
|
|
||
|
if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
|
||
|
fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
|
||
|
fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
|
||
|
MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
|
||
|
|
||
|
MYDBG
|
||
|
(
|
||
|
printf("name = %s\n", deviceProp[dev].name);
|
||
|
printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
|
||
|
printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
|
||
|
printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
|
||
|
printf("warpSize = %i\n", deviceProp[dev].warpSize);
|
||
|
printf("memPitch = %i\n", deviceProp[dev].memPitch);
|
||
|
printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
|
||
|
printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
|
||
|
printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
|
||
|
printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
|
||
|
printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
|
||
|
printf("clockRate = %i\n", deviceProp[dev].clockRate);
|
||
|
printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
|
||
|
printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
|
||
|
printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
|
||
|
printf("computeMode = %i\n", deviceProp[dev].computeMode);
|
||
|
)
|
||
|
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void* CudaWrapper_AllocCudaData(unsigned nbytes)
|
||
|
{
|
||
|
void* dev_data;
|
||
|
CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
|
||
|
MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
|
||
|
CudaWrapper_total_gpu_mem += nbytes;
|
||
|
return dev_data;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
|
||
|
cudaThreadSynchronize();
|
||
|
timespec time1, time2;
|
||
|
clock_gettime(CLOCK_REALTIME, &time1);
|
||
|
CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
|
||
|
clock_gettime(CLOCK_REALTIME, &time2);
|
||
|
CudaWrapper_total_upload_time +=
|
||
|
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
|
||
|
cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
|
||
|
cudaThreadSynchronize();
|
||
|
timespec time1, time2;
|
||
|
clock_gettime(CLOCK_REALTIME, &time1);
|
||
|
CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
|
||
|
clock_gettime(CLOCK_REALTIME, &time2);
|
||
|
CudaWrapper_total_download_time +=
|
||
|
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
|
||
|
cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
|
||
|
CUDA_SAFE_CALL(cudaFree(dev_data));
|
||
|
CudaWrapper_total_gpu_mem -= nbytes;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
|
||
|
CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
|
||
|
CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
|
||
|
}
|
||
|
|
||
|
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
|
||
|
{
|
||
|
void* host_data;
|
||
|
int flags = 0;
|
||
|
|
||
|
if(mapped) flags = flags | cudaHostAllocMapped;
|
||
|
|
||
|
if(writeCombined) flags = flags | cudaHostAllocWriteCombined;
|
||
|
|
||
|
CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags));
|
||
|
// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
|
||
|
MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);)
|
||
|
return host_data;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_FreePinnedHostData(void* host_data)
|
||
|
{
|
||
|
MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);)
|
||
|
|
||
|
if(host_data)
|
||
|
CUDA_SAFE_CALL(cudaFreeHost(host_data));
|
||
|
}
|
||
|
|
||
|
void cuda_check_error(char* comment)
|
||
|
{
|
||
|
printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
|
||
|
}
|
||
|
|
||
|
int CudaWrapper_CheckMemUseage()
|
||
|
{
|
||
|
size_t free, total;
|
||
|
cudaMemGetInfo(&free, &total);
|
||
|
return total - free; //possible with cuda 3.0 ???
|
||
|
//return CudaWrapper_total_gpu_mem;
|
||
|
}
|
||
|
|
||
|
double CudaWrapper_CheckUploadTime(bool reset)
|
||
|
{
|
||
|
if(reset) CudaWrapper_total_upload_time = 0.0;
|
||
|
|
||
|
return CudaWrapper_total_upload_time;
|
||
|
}
|
||
|
|
||
|
double CudaWrapper_CheckDownloadTime(bool reset)
|
||
|
{
|
||
|
if(reset) CudaWrapper_total_download_time = 0.0;
|
||
|
|
||
|
return CudaWrapper_total_download_time;
|
||
|
}
|
||
|
|
||
|
double CudaWrapper_CheckCPUBufUploadTime(bool reset)
|
||
|
{
|
||
|
if(reset) CudaWrapper_cpubuffer_upload_time = 0.0;
|
||
|
|
||
|
return CudaWrapper_cpubuffer_upload_time;
|
||
|
}
|
||
|
|
||
|
double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
|
||
|
{
|
||
|
if(reset) CudaWrapper_cpubuffer_download_time = 0.0;
|
||
|
|
||
|
return CudaWrapper_cpubuffer_download_time;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_AddCPUBufUploadTime(double dt)
|
||
|
{
|
||
|
CudaWrapper_cpubuffer_upload_time += dt;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_AddCPUBufDownloadTime(double dt)
|
||
|
{
|
||
|
CudaWrapper_cpubuffer_download_time += dt;
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_Sync()
|
||
|
{
|
||
|
cudaThreadSynchronize();
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_SyncStream(int stream)
|
||
|
{
|
||
|
cudaStreamSynchronize(streams[stream]);
|
||
|
}
|
||
|
|
||
|
void CudaWrapper_AddStreams(int n)
|
||
|
{
|
||
|
cudaStream_t* new_streams = new cudaStream_t[nstreams + n];
|
||
|
|
||
|
for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i];
|
||
|
|
||
|
for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]);
|
||
|
|
||
|
if(nstreams > 0)
|
||
|
delete [] streams;
|
||
|
|
||
|
streams = new_streams;
|
||
|
nstreams += n;
|
||
|
}
|
||
|
|
||
|
void* CudaWrapper_returnStreams()
|
||
|
{
|
||
|
return (void*) streams;
|
||
|
}
|
||
|
|
||
|
int CudaWrapper_returnNStreams()
|
||
|
{
|
||
|
return nstreams;
|
||
|
}
|
||
|
|