// lammps/lib/cuda/cuda_data_kernel.cu
// (196 lines, 3.9 KiB)
// Copy `buffer` (double) into `dev_data` (float, implicit narrowing on assignment),
// optionally permuting the two fastest-varying dimensions.
// One thread handles one element; flat index layout assumes a 2D grid:
//   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Preconditions: nx > 0; ny and nz are 0 for unused dimensions.
// NOTE(review): `copy_mode` and its enumerators (x, xx, xy, yx, xyz, xzy) are
// declared elsewhere in the project.
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1D copy is handled identically to xx

  // Total element count = product of the dimensions actually in use.
  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;  // guard against the grid tail

  // Each case must end in `break`: falling through into the transposing
  // cases would issue extra, wrongly-indexed writes (and divide by a
  // possibly-zero ny/nz).
  switch(mode) {
    case xx: {              // 1D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;
    }

    case xy: {              // 2D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;
    }

    case yx: {              // 2D transpose: buffer (nx, ny) -> dev_data (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;
    }

    case xyz: {             // 3D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;
    }

    case xzy: {             // 3D swap of the last two dims: (nx, ny, nz) -> (nx, nz, ny)
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
    }
  }
}
// Copy `buffer` (double) into `dev_data` (double), optionally permuting the
// two fastest-varying dimensions. One thread per element; flat index layout
// assumes a 2D grid:
//   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Preconditions: nx > 0; ny and nz are 0 for unused dimensions.
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1D copy is handled identically to xx

  // Total element count = product of the dimensions actually in use.
  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;  // guard against the grid tail

  // `break` after each case prevents fall-through into the transposing
  // cases, which would issue wrongly-indexed extra writes.
  switch(mode) {
    case xx:                // 1D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xy:                // 2D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case yx:                // 2D transpose: buffer (nx, ny) -> dev_data (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:               // 3D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xzy:               // 3D swap of the last two dims: (nx, ny, nz) -> (nx, nz, ny)
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// Copy `buffer` (float) into `dev_data` (double, implicit widening on
// assignment), optionally permuting the two fastest-varying dimensions.
// One thread per element; flat index layout assumes a 2D grid:
//   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Preconditions: nx > 0; ny and nz are 0 for unused dimensions.
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1D copy is handled identically to xx

  // Total element count = product of the dimensions actually in use.
  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;  // guard against the grid tail

  // `break` after each case prevents fall-through into the transposing
  // cases, which would issue wrongly-indexed extra writes.
  switch(mode) {
    case xx:                // 1D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xy:                // 2D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case yx:                // 2D transpose: buffer (nx, ny) -> dev_data (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:               // 3D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xzy:               // 3D swap of the last two dims: (nx, ny, nz) -> (nx, nz, ny)
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// Copy `buffer` (float) into `dev_data` (float), optionally permuting the
// two fastest-varying dimensions. One thread per element; flat index layout
// assumes a 2D grid:
//   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Preconditions: nx > 0; ny and nz are 0 for unused dimensions.
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1D copy is handled identically to xx

  // Total element count = product of the dimensions actually in use.
  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;  // guard against the grid tail

  // `break` after each case prevents fall-through into the transposing
  // cases, which would issue wrongly-indexed extra writes.
  switch(mode) {
    case xx:                // 1D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xy:                // 2D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case yx:                // 2D transpose: buffer (nx, ny) -> dev_data (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:               // 3D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xzy:               // 3D swap of the last two dims: (nx, ny, nz) -> (nx, nz, ny)
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// Copy `buffer` (int) into `dev_data` (int), optionally permuting the two
// fastest-varying dimensions. One thread per element; flat index layout
// assumes a 2D grid:
//   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Preconditions: nx > 0; ny and nz are 0 for unused dimensions.
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1D copy is handled identically to xx

  // Total element count = product of the dimensions actually in use.
  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;  // guard against the grid tail

  // `break` after each case prevents fall-through into the transposing
  // cases, which would issue wrongly-indexed extra writes.
  switch(mode) {
    case xx:                // 1D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xy:                // 2D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case yx:                // 2D transpose: buffer (nx, ny) -> dev_data (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:               // 3D copy, layout unchanged
      dev_data[i] = buffer[i];
      break;

    case xzy:               // 3D swap of the last two dims: (nx, ny, nz) -> (nx, nz, ny)
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}