Added versions of gpuKnn functions adapted for CMI. Refactoring needed.

This commit is contained in:
Pedro Martinez Mediano 2017-12-18 17:47:53 +00:00
parent 1fb4015df5
commit f1bd757dc0
3 changed files with 148 additions and 0 deletions

View File

@ -469,6 +469,26 @@ __global__ void gpuDigammas(float *g_digammas, int *g_nx, int *g_ny, int signall
}
__global__ void gpuDigammasCMI(float *g_digammas, int *g_nx, int *g_ny, int *g_nz, int signallength) {
const unsigned int i = threadIdx.x + blockDim.x*blockIdx.x;
if(i < signallength){
// Fetch n and put it in thread memory
double dgX = (double) g_nx[i];
double dgY = (double) g_ny[i];
double dgZ = (double) g_nz[i];
// In-place digamma calculation
digammaXp1(&dgX);
digammaXp1(&dgY);
digammaXp1(&dgZ);
// Copy back to global memory
g_digammas[i] = (float) (dgX + dgY - dgZ);
}
return;
}

View File

@ -57,6 +57,57 @@ int allocateDeviceMemory(int signalLength, int k, int dimx, int dimy,
return 1;
}
/**
* Allocate all necessary memory for the whole CMI calculation in a single call
* to cudaMalloc, and point the pointers to the right place.
*
* @param signalLength total number of samples given, including surrogates
* @param k nunmber of neighbours to find
* @param dimx dimension of source points
* @param dimy dimension of dest points
* @param dimz dimension of cond points
* @param source,dest,cond,distances,indexes,radii,nx,ny,nz,digammas device pointers
* @param pointset pointer to the data array in host memory
*
* @return error code
*/
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
float **source, float **dest, float **cond, float **distances, int **indexes,
float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset) {
float *d_pointset;
int dims = dimx + dimy + dimz;
size_t mempointset = signalLength * dims * sizeof(float);
size_t memdistances = signalLength * k * sizeof(float);
size_t memindexes = signalLength * k * sizeof(int);
size_t memcounts = 3 * signalLength * sizeof(int);
size_t memdigammas = signalLength * sizeof(float);
size_t memtotal = mempointset + memdistances + memindexes + memcounts + memdigammas;
checkCudaErrors( cudaMalloc((void **) &d_pointset, memtotal) );
cudaError_t error = cudaGetLastError();
if(error!=cudaSuccess){
fprintf(stderr,"%s",cudaGetErrorString(error));
return 0;
}
checkCudaErrors( cudaMemcpy(d_pointset, pointset, mempointset, cudaMemcpyHostToDevice) );
*source = d_pointset;
*cond = *source + signalLength*dimx;
*dest = *cond + signalLength*dimz;
*distances = *dest + signalLength*dimy;
*radii = *distances + (k-1)*signalLength;
*indexes = (int *) (*distances + k*signalLength);
*nx = *indexes + signalLength;
*ny = *nx + signalLength;
*nz = *ny + signalLength;
*digammas = (float *) (*nz + signalLength);
return 1;
}
/**
* Free all the memory used in GPU (if allocated using allocateDeviceMemory.
*
@ -480,6 +531,26 @@ int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
}
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
int *d_ny, int *d_nz, int signalLength) {
// Kernel parameters
dim3 threads(1,1,1);
dim3 grid(1,1,1);
threads.x = 512;
grid.x = (signalLength-1)/threads.x + 1;
// Launch kernel
gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors( cudaMemcpy(digammas, d_digammas, signalLength * sizeof(float), cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaDeviceSynchronize() );
return 1;
}
int parallelDigammas(float *digammas, int *nx, int *ny, int signalLength) {
int *d_nx, *d_ny;
@ -560,6 +631,53 @@ int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
return 1;
}
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
float *d_digammas, int trialLength, int nchunks) {
float *d_sumDigammas;
int signalLength = trialLength * nchunks;
// Kernel parameters
dim3 threads(1,1,1);
dim3 grid(1,1,1);
threads.x = 512;
grid.x = (signalLength-1)/threads.x + 1;
checkCudaErrors( cudaMalloc((void **) &d_sumDigammas, nchunks * sizeof(int)) );
// Launch kernel to calculate (digamma(nx+1) + digamma(ny+1)), and leave
// results in GPU
gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
checkCudaErrors( cudaDeviceSynchronize() );
int offset_size = nchunks + 1;
int offsets[offset_size];
for (int i = 0; i < (nchunks+1); i++) { offsets[i] = i*trialLength; }
int *d_offsets;
checkCudaErrors(cudaMalloc((void **) &d_offsets, (nchunks + 1)*sizeof(int)));
checkCudaErrors(cudaMemcpy(d_offsets, offsets, (nchunks + 1)*sizeof(int), cudaMemcpyHostToDevice));
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
nchunks, d_offsets, d_offsets + 1);
// Allocate temporary storage
checkCudaErrors( cudaMalloc(&d_temp_storage, temp_storage_bytes) );
checkCudaErrors( cudaDeviceSynchronize() );
// Run sum-reduction
cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
nchunks, d_offsets, d_offsets + 1);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors( cudaFree(d_temp_storage) );
checkCudaErrors( cudaFree(d_offsets) );
checkCudaErrors(cudaMemcpy(sumDigammas, d_sumDigammas, nchunks*sizeof(float), cudaMemcpyDeviceToHost));
checkCudaErrors( cudaFree(d_sumDigammas) );
return 1;
}
/**
* Calculate and sum digammas in chunks fully in GPU.
*/

View File

@ -10,6 +10,10 @@ int allocateDeviceMemory(int signalLength, int kth, int dimx, int dimy,
float **source, float **dest, float **distances, int **indexes,
float **radii, int **nx, int **ny, float **digammas, float *pointset);
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
float **source, float **dest, float **cond, float **distances, int **indexes,
float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset);
int freeDeviceMemory(float *d_pointset);
int cudaFindKnn(int* h_bf_indexes, float* h_bf_distances, float* h_pointset,
@ -49,9 +53,15 @@ int d_cudaFindRSAll(int* d_bf_npointsrange, float* d_bf_pointset, float* d_bf_qu
int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
int *d_ny, int signalLength);
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
int *d_ny, int *d_nz, int signalLength);
int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
float *d_digammas, int trialLength, int nchunks);
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
float *d_digammas, int trialLength, int nchunks);
void device_reset(void);
void gpuWarmUp(void);