mirror of https://github.com/jlizier/jidt
Added versions of gpuKnn functions adapted for CMI. Refactoring needed.
This commit is contained in:
parent
1fb4015df5
commit
f1bd757dc0
|
@ -469,6 +469,26 @@ __global__ void gpuDigammas(float *g_digammas, int *g_nx, int *g_ny, int signall
|
|||
}
|
||||
|
||||
|
||||
__global__ void gpuDigammasCMI(float *g_digammas, int *g_nx, int *g_ny, int *g_nz, int signallength) {
  // One thread per sample: store digamma(nx+1) + digamma(ny+1) - digamma(nz+1)
  // into g_digammas[i]. Threads past the end of the signal do nothing.
  const unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if (idx >= signallength) {
    return;
  }

  // Promote the neighbour counts to double for the digamma evaluation.
  double digammaX = (double) g_nx[idx];
  double digammaY = (double) g_ny[idx];
  double digammaZ = (double) g_nz[idx];

  // digammaXp1 evaluates digamma(x + 1) in place.
  digammaXp1(&digammaX);
  digammaXp1(&digammaY);
  digammaXp1(&digammaZ);

  // Write the combined result back to global memory as float.
  g_digammas[idx] = (float) (digammaX + digammaY - digammaZ);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -57,6 +57,57 @@ int allocateDeviceMemory(int signalLength, int k, int dimx, int dimy,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate all necessary memory for the whole CMI calculation in a single call
|
||||
* to cudaMalloc, and point the pointers to the right place.
|
||||
*
|
||||
* @param signalLength total number of samples given, including surrogates
|
||||
 * @param k number of neighbours to find
|
||||
* @param dimx dimension of source points
|
||||
* @param dimy dimension of dest points
|
||||
* @param dimz dimension of cond points
|
||||
* @param source,dest,cond,distances,indexes,radii,nx,ny,nz,digammas device pointers
|
||||
* @param pointset pointer to the data array in host memory
|
||||
*
|
||||
* @return error code
|
||||
*/
|
||||
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
    float **source, float **dest, float **cond, float **distances, int **indexes,
    float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset) {

  float *d_base;
  int dims = dimx + dimy + dimz;

  // Byte sizes of each sub-buffer carved out of one big device allocation.
  size_t mempointset  = signalLength * dims * sizeof(float);
  size_t memdistances = signalLength * k * sizeof(float);
  size_t memindexes   = signalLength * k * sizeof(int);
  size_t memcounts    = 3 * signalLength * sizeof(int);     // nx, ny and nz counts
  size_t memdigammas  = signalLength * sizeof(float);
  size_t memtotal     = mempointset + memdistances + memindexes + memcounts + memdigammas;

  // Single allocation for everything; individual pointers are offsets into it.
  checkCudaErrors( cudaMalloc((void **) &d_base, memtotal) );

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    fprintf(stderr, "%s", cudaGetErrorString(error));
    return 0;
  }

  // Upload the host point set into the start of the device block.
  checkCudaErrors( cudaMemcpy(d_base, pointset, mempointset, cudaMemcpyHostToDevice) );

  // Carve the allocation up. Device layout of the point data is
  // source | cond | dest. NOTE(review): the doc comment above lists the
  // arguments as source, dest, cond — confirm the host pointset uses the
  // source|cond|dest ordering assumed here.
  *source = d_base;
  *cond = *source + signalLength * dimx;
  *dest = *cond + signalLength * dimz;
  *distances = *dest + signalLength * dimy;
  // radii intentionally alias the last of the k distance rows (the distance
  // to the k-th neighbour), so no separate storage is reserved for them.
  *radii = *distances + (k - 1) * signalLength;
  *indexes = (int *) (*distances + k * signalLength);
  *nx = *indexes + signalLength;
  *ny = *nx + signalLength;
  *nz = *ny + signalLength;
  *digammas = (float *) (*nz + signalLength);

  return 1;
}
|
||||
|
||||
/**
|
||||
 * Free all the memory used on the GPU (if it was allocated using allocateDeviceMemory).
|
||||
*
|
||||
|
@ -480,6 +531,26 @@ int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
|
|||
}
|
||||
|
||||
|
||||
/**
 * Compute digamma(nx+1) + digamma(ny+1) - digamma(nz+1) for every sample on
 * the GPU and copy the per-sample results back to host memory.
 *
 * @param digammas host output buffer of signalLength floats
 * @param d_digammas device scratch/output buffer of signalLength floats
 * @param d_nx,d_ny,d_nz device neighbour-count arrays of signalLength ints
 * @param signalLength total number of samples
 *
 * @return 1 on success (CUDA errors are handled by checkCudaErrors)
 */
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
    int *d_ny, int *d_nz, int signalLength) {

  // Kernel parameters: one thread per sample, 512 threads per block.
  dim3 threads(1,1,1);
  dim3 grid(1,1,1);
  threads.x = 512;
  grid.x = (signalLength-1)/threads.x + 1;  // ceil(signalLength / threads.x)

  // Launch kernel
  gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
  // FIX: check for launch-configuration errors (not reported by the sync alone),
  // then for asynchronous execution errors.
  checkCudaErrors( cudaGetLastError() );
  checkCudaErrors( cudaDeviceSynchronize() );

  // cudaMemcpy is blocking with respect to the host, so the redundant
  // cudaDeviceSynchronize() that followed it has been removed.
  checkCudaErrors( cudaMemcpy(digammas, d_digammas, signalLength * sizeof(float), cudaMemcpyDeviceToHost) );

  return 1;
}
|
||||
|
||||
|
||||
int parallelDigammas(float *digammas, int *nx, int *ny, int signalLength) {
|
||||
|
||||
int *d_nx, *d_ny;
|
||||
|
@ -560,6 +631,53 @@ int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Compute per-sample digammas on the GPU and reduce them into one sum per
 * chunk, copying the nchunks partial sums back to host memory.
 *
 * @param sumDigammas host output buffer of nchunks floats
 * @param d_nx,d_ny,d_nz device neighbour-count arrays (trialLength*nchunks ints each)
 * @param d_digammas device scratch buffer of trialLength*nchunks floats
 * @param trialLength number of samples per chunk
 * @param nchunks number of chunks (segments) to reduce over
 *
 * @return 1 on success (CUDA errors are handled by checkCudaErrors)
 */
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
    float *d_digammas, int trialLength, int nchunks) {

  float *d_sumDigammas;
  int signalLength = trialLength * nchunks;

  // Kernel parameters: one thread per sample, 512 threads per block.
  dim3 threads(1,1,1);
  dim3 grid(1,1,1);
  threads.x = 512;
  grid.x = (signalLength-1)/threads.x + 1;
  // FIX: the output buffer holds floats, not ints (same size on common
  // platforms, which masked the bug, but sizeof(float) is the correct unit).
  checkCudaErrors( cudaMalloc((void **) &d_sumDigammas, nchunks * sizeof(float)) );

  // Launch kernel to calculate digamma(nx+1) + digamma(ny+1) - digamma(nz+1)
  // per sample, and leave results in GPU
  gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
  checkCudaErrors( cudaGetLastError() );  // FIX: catch launch-configuration errors
  checkCudaErrors( cudaDeviceSynchronize() );

  // Segment offsets for the per-chunk reduction: chunk c covers samples
  // [c*trialLength, (c+1)*trialLength). Heap-allocated instead of the
  // original variable-length array, which is not standard C++.
  int *offsets = new int[nchunks + 1];
  for (int i = 0; i < (nchunks+1); i++) { offsets[i] = i*trialLength; }
  int *d_offsets;
  checkCudaErrors(cudaMalloc((void **) &d_offsets, (nchunks + 1)*sizeof(int)));
  checkCudaErrors(cudaMemcpy(d_offsets, offsets, (nchunks + 1)*sizeof(int), cudaMemcpyHostToDevice));
  delete[] offsets;

  // First CUB call with a NULL workspace only queries the required temporary
  // storage size; the second call performs the segmented sum.
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
      nchunks, d_offsets, d_offsets + 1);
  // Allocate temporary storage
  checkCudaErrors( cudaMalloc(&d_temp_storage, temp_storage_bytes) );
  checkCudaErrors( cudaDeviceSynchronize() );

  // Run sum-reduction
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
      nchunks, d_offsets, d_offsets + 1);
  checkCudaErrors( cudaDeviceSynchronize() );

  checkCudaErrors( cudaFree(d_temp_storage) );
  checkCudaErrors( cudaFree(d_offsets) );

  // Copy the nchunks partial sums back to the host and release the buffer.
  checkCudaErrors(cudaMemcpy(sumDigammas, d_sumDigammas, nchunks*sizeof(float), cudaMemcpyDeviceToHost));
  checkCudaErrors( cudaFree(d_sumDigammas) );

  return 1;
}
|
||||
|
||||
/**
|
||||
* Calculate and sum digammas in chunks fully in GPU.
|
||||
*/
|
||||
|
|
|
@ -10,6 +10,10 @@ int allocateDeviceMemory(int signalLength, int kth, int dimx, int dimy,
|
|||
float **source, float **dest, float **distances, int **indexes,
|
||||
float **radii, int **nx, int **ny, float **digammas, float *pointset);
|
||||
|
||||
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
|
||||
float **source, float **dest, float **cond, float **distances, int **indexes,
|
||||
float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset);
|
||||
|
||||
int freeDeviceMemory(float *d_pointset);
|
||||
|
||||
int cudaFindKnn(int* h_bf_indexes, float* h_bf_distances, float* h_pointset,
|
||||
|
@ -49,9 +53,15 @@ int d_cudaFindRSAll(int* d_bf_npointsrange, float* d_bf_pointset, float* d_bf_qu
|
|||
int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
|
||||
int *d_ny, int signalLength);
|
||||
|
||||
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
|
||||
int *d_ny, int *d_nz, int signalLength);
|
||||
|
||||
int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
|
||||
float *d_digammas, int trialLength, int nchunks);
|
||||
|
||||
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
|
||||
float *d_digammas, int trialLength, int nchunks);
|
||||
|
||||
void device_reset(void);
|
||||
|
||||
void gpuWarmUp(void);
|
||||
|
|
Loading…
Reference in New Issue