lammps/lib/cuda/cuda_data.cu

enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet

#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>

void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < size) grid.y++;

  float debugdata[size];
  //int* cu_debug=(int*) CudaWrapper_AllocCudaData(size*sizeof(FLOAT));
  size *= sizeof(double);
  printf("size: %i (%i %i %i) (%i %i %i) %p\n", size, grid.x, grid.y, threads.x, n[0], n[1], n[2], buffer);
  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
  CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2);
  double sum = 0;
  printf("debugdata: ");

  for(int i = 0; i < size / sizeof(double); i++) sum += (debugdata[i] - ((double*) host_data)[i]) * (debugdata[i] - ((double*) host_data)[i]);

  printf("%lf \n", sum);

}

void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < size) grid.y++;

  size *= sizeof(double);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < size) grid.y++;

  size *= sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < size) grid.y++;

  size *= sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < size) grid.y++;

  size *= sizeof(int);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
}
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8922 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2012-10-08 23:29:55 +08:00			`enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet`

			`#include "cuda_data_cu.h"`
			`#include "cuda_wrapper_cu.h"`
			`#include "cuda_data_kernel.cu"`
			`#include <cstdio>`

			`void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`int size = n[0];`

			`if(n[1] > 0) size *= n[1];`

			`if(n[2] > 0) size *= n[2];`

			`dim3 threads;`
			`threads.x = 1;`
			`threads.y = 1;`
			`threads.z = 1;`
			`dim3 grid;`
			`grid.x = 1;`
			`grid.y = 1;`
			`grid.z = 1;`

			`if(size <= 128 * 30)`
			`threads.x = 32;`
			`else if(size <= 256 * 30)`
			`threads.x = 64;`
			`else if(size <= 512 * 30)`
			`threads.x = 128;`
			`else`
			`threads.x = 256;`

			`grid.x = ((size - 1) + threads.x) / threads.x;`

			`if(grid.x > 32000)`
			`grid.x = 32000;`

			`while(grid.x * grid.y * threads.x < size) grid.y++;`

			`float debugdata[size];`
			`//int* cu_debug=(int) CudaWrapper_AllocCudaData(sizesizeof(FLOAT));`
			`size *= sizeof(double);`
			`printf("size: %i (%i %i %i) (%i %i %i) %p\n", size, grid.x, grid.y, threads.x, n[0], n[1], n[2], buffer);`
			`CudaWrapper_UploadCudaData(host_data, buffer, size);`
			`CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double)buffer, (float)dev_data, n[0], n[1], n[2], mode);`
			`cudaThreadSynchronize();`
			`CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2);`
			`double sum = 0;`
			`printf("debugdata: ");`

			`for(int i = 0; i < size / sizeof(double); i++) sum += (debugdata[i] - ((double) host_data)[i]) (debugdata[i] - ((double*) host_data)[i]);`

			`printf("%lf \n", sum);`

			`}`

			`void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`int size = n[0];`

			`if(n[1] > 0) size *= n[1];`

			`if(n[2] > 0) size *= n[2];`

			`dim3 threads;`
			`threads.x = 1;`
			`threads.y = 1;`
			`threads.z = 1;`
			`dim3 grid;`
			`grid.x = 1;`
			`grid.y = 1;`
			`grid.z = 1;`

			`if(size <= 128 * 30)`
			`threads.x = 32;`
			`else if(size <= 256 * 30)`
			`threads.x = 64;`
			`else if(size <= 512 * 30)`
			`threads.x = 128;`
			`else`
			`threads.x = 256;`

			`grid.x = ((size - 1) + threads.x) / threads.x;`

			`if(grid.x > 32000)`
			`grid.x = 32000;`

			`while(grid.x * grid.y * threads.x < size) grid.y++;`

			`size *= sizeof(double);`

			`CudaWrapper_UploadCudaData(host_data, buffer, size);`
			`CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double)buffer, (double)dev_data, n[0], n[1], n[2], mode);`
			`cudaThreadSynchronize();`
			`}`

			`void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`int size = n[0];`

			`if(n[1] > 0) size *= n[1];`

			`if(n[2] > 0) size *= n[2];`

			`dim3 threads;`
			`threads.x = 1;`
			`threads.y = 1;`
			`threads.z = 1;`
			`dim3 grid;`
			`grid.x = 1;`
			`grid.y = 1;`
			`grid.z = 1;`

			`if(size <= 128 * 30)`
			`threads.x = 32;`
			`else if(size <= 256 * 30)`
			`threads.x = 64;`
			`else if(size <= 512 * 30)`
			`threads.x = 128;`
			`else`
			`threads.x = 256;`

			`grid.x = ((size - 1) + threads.x) / threads.x;`

			`if(grid.x > 32000)`
			`grid.x = 32000;`

			`while(grid.x * grid.y * threads.x < size) grid.y++;`

			`size *= sizeof(float);`

			`CudaWrapper_UploadCudaData(host_data, buffer, size);`
			`CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float)buffer, (double)dev_data, n[0], n[1], n[2], mode);`
			`cudaThreadSynchronize();`
			`}`

			`void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`int size = n[0];`

			`if(n[1] > 0) size *= n[1];`

			`if(n[2] > 0) size *= n[2];`

			`dim3 threads;`
			`threads.x = 1;`
			`threads.y = 1;`
			`threads.z = 1;`
			`dim3 grid;`
			`grid.x = 1;`
			`grid.y = 1;`
			`grid.z = 1;`

			`if(size <= 128 * 30)`
			`threads.x = 32;`
			`else if(size <= 256 * 30)`
			`threads.x = 64;`
			`else if(size <= 512 * 30)`
			`threads.x = 128;`
			`else`
			`threads.x = 256;`

			`grid.x = ((size - 1) + threads.x) / threads.x;`

			`if(grid.x > 32000)`
			`grid.x = 32000;`

			`while(grid.x * grid.y * threads.x < size) grid.y++;`

			`size *= sizeof(float);`

			`CudaWrapper_UploadCudaData(host_data, buffer, size);`
			`CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float)buffer, (float)dev_data, n[0], n[1], n[2], mode);`
			`cudaThreadSynchronize();`
			`}`

			`void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`int size = n[0];`

			`if(n[1] > 0) size *= n[1];`

			`if(n[2] > 0) size *= n[2];`

			`dim3 threads;`
			`threads.x = 1;`
			`threads.y = 1;`
			`threads.z = 1;`
			`dim3 grid;`
			`grid.x = 1;`
			`grid.y = 1;`
			`grid.z = 1;`

			`if(size <= 128 * 30)`
			`threads.x = 32;`
			`else if(size <= 256 * 30)`
			`threads.x = 64;`
			`else if(size <= 512 * 30)`
			`threads.x = 128;`
			`else`
			`threads.x = 256;`

			`grid.x = ((size - 1) + threads.x) / threads.x;`

			`if(grid.x > 32000)`
			`grid.x = 32000;`

			`while(grid.x * grid.y * threads.x < size) grid.y++;`

			`size *= sizeof(int);`

			`CudaWrapper_UploadCudaData(host_data, buffer, size);`
			`CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int)buffer, (int)dev_data, n[0], n[1], n[2], mode);`
			`cudaThreadSynchronize();`
			`}`

			`void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)`
			`{`
			`}`