mirror of https://github.com/jlizier/jidt
Added versions of gpuKnn functions adapted for CMI. Refactoring needed.
This commit is contained in:
parent
1fb4015df5
commit
f1bd757dc0
|
@ -469,6 +469,26 @@ __global__ void gpuDigammas(float *g_digammas, int *g_nx, int *g_ny, int signall
|
|||
}
|
||||
|
||||
|
||||
__global__ void gpuDigammasCMI(float *g_digammas, int *g_nx, int *g_ny, int *g_nz, int signallength) {
  // One thread per sample: store digamma(nx+1) + digamma(ny+1) - digamma(nz+1)
  // into g_digammas[i]. Threads past the end of the signal do nothing.
  const unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if (idx >= signallength) {
    return;
  }

  // Promote the neighbour counts to double for the digamma evaluation.
  double digammaX = (double) g_nx[idx];
  double digammaY = (double) g_ny[idx];
  double digammaZ = (double) g_nz[idx];

  // digammaXp1 evaluates digamma(x + 1) in place.
  digammaXp1(&digammaX);
  digammaXp1(&digammaY);
  digammaXp1(&digammaZ);

  // Write the combined result back to global memory as float.
  g_digammas[idx] = (float) (digammaX + digammaY - digammaZ);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -57,6 +57,57 @@ int allocateDeviceMemory(int signalLength, int k, int dimx, int dimy,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate all necessary memory for the whole CMI calculation in a single call
|
||||
* to cudaMalloc, and point the pointers to the right place.
|
||||
*
|
||||
* @param signalLength total number of samples given, including surrogates
|
||||
 * @param k number of neighbours to find
|
||||
* @param dimx dimension of source points
|
||||
* @param dimy dimension of dest points
|
||||
* @param dimz dimension of cond points
|
||||
* @param source,dest,cond,distances,indexes,radii,nx,ny,nz,digammas device pointers
|
||||
* @param pointset pointer to the data array in host memory
|
||||
*
|
||||
* @return error code
|
||||
*/
|
||||
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
    float **source, float **dest, float **cond, float **distances, int **indexes,
    float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset) {

  float *d_base;
  int dims = dimx + dimy + dimz;

  // Byte sizes of each sub-buffer carved out of one big device allocation.
  size_t mempointset  = signalLength * dims * sizeof(float);
  size_t memdistances = signalLength * k * sizeof(float);
  size_t memindexes   = signalLength * k * sizeof(int);
  size_t memcounts    = 3 * signalLength * sizeof(int);     // nx, ny and nz counts
  size_t memdigammas  = signalLength * sizeof(float);
  size_t memtotal     = mempointset + memdistances + memindexes + memcounts + memdigammas;

  // Single allocation for everything; individual pointers are offsets into it.
  checkCudaErrors( cudaMalloc((void **) &d_base, memtotal) );

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    fprintf(stderr, "%s", cudaGetErrorString(error));
    return 0;
  }

  // Upload the host point set into the start of the device block.
  checkCudaErrors( cudaMemcpy(d_base, pointset, mempointset, cudaMemcpyHostToDevice) );

  // Carve the allocation up. Device layout of the point data is
  // source | cond | dest. NOTE(review): the doc comment above lists the
  // arguments as source, dest, cond — confirm the host pointset uses the
  // source|cond|dest ordering assumed here.
  *source = d_base;
  *cond = *source + signalLength * dimx;
  *dest = *cond + signalLength * dimz;
  *distances = *dest + signalLength * dimy;
  // radii intentionally alias the last of the k distance rows (the distance
  // to the k-th neighbour), so no separate storage is reserved for them.
  *radii = *distances + (k - 1) * signalLength;
  *indexes = (int *) (*distances + k * signalLength);
  *nx = *indexes + signalLength;
  *ny = *nx + signalLength;
  *nz = *ny + signalLength;
  *digammas = (float *) (*nz + signalLength);

  return 1;
}
|
||||
|
||||
/**
|
||||
 * Free all the memory used on the GPU (if it was allocated using allocateDeviceMemory).
|
||||
*
|
||||
|
@ -480,6 +531,26 @@ int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
|
|||
}
|
||||
|
||||
|
||||
/**
 * Compute digamma(nx+1) + digamma(ny+1) - digamma(nz+1) for every sample on
 * the GPU and copy the per-sample results back to host memory.
 *
 * @param digammas host output buffer of signalLength floats
 * @param d_digammas device scratch/output buffer of signalLength floats
 * @param d_nx,d_ny,d_nz device neighbour-count arrays of signalLength ints
 * @param signalLength total number of samples
 *
 * @return 1 on success (CUDA errors are handled by checkCudaErrors)
 */
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
    int *d_ny, int *d_nz, int signalLength) {

  // Kernel parameters: one thread per sample, 512 threads per block.
  dim3 threads(1,1,1);
  dim3 grid(1,1,1);
  threads.x = 512;
  grid.x = (signalLength-1)/threads.x + 1;  // ceil(signalLength / threads.x)

  // Launch kernel
  gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
  // FIX: check for launch-configuration errors (not reported by the sync alone),
  // then for asynchronous execution errors.
  checkCudaErrors( cudaGetLastError() );
  checkCudaErrors( cudaDeviceSynchronize() );

  // cudaMemcpy is blocking with respect to the host, so the redundant
  // cudaDeviceSynchronize() that followed it has been removed.
  checkCudaErrors( cudaMemcpy(digammas, d_digammas, signalLength * sizeof(float), cudaMemcpyDeviceToHost) );

  return 1;
}
|
||||
|
||||
|
||||
int parallelDigammas(float *digammas, int *nx, int *ny, int signalLength) {
|
||||
|
||||
int *d_nx, *d_ny;
|
||||
|
@ -560,6 +631,53 @@ int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Compute per-sample digammas on the GPU and reduce them into one sum per
 * chunk, copying the nchunks partial sums back to host memory.
 *
 * @param sumDigammas host output buffer of nchunks floats
 * @param d_nx,d_ny,d_nz device neighbour-count arrays (trialLength*nchunks ints each)
 * @param d_digammas device scratch buffer of trialLength*nchunks floats
 * @param trialLength number of samples per chunk
 * @param nchunks number of chunks (segments) to reduce over
 *
 * @return 1 on success (CUDA errors are handled by checkCudaErrors)
 */
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
    float *d_digammas, int trialLength, int nchunks) {

  float *d_sumDigammas;
  int signalLength = trialLength * nchunks;

  // Kernel parameters: one thread per sample, 512 threads per block.
  dim3 threads(1,1,1);
  dim3 grid(1,1,1);
  threads.x = 512;
  grid.x = (signalLength-1)/threads.x + 1;
  // FIX: the output buffer holds floats, not ints (same size on common
  // platforms, which masked the bug, but sizeof(float) is the correct unit).
  checkCudaErrors( cudaMalloc((void **) &d_sumDigammas, nchunks * sizeof(float)) );

  // Launch kernel to calculate digamma(nx+1) + digamma(ny+1) - digamma(nz+1)
  // per sample, and leave results in GPU
  gpuDigammasCMI<<<grid.x, threads.x>>>(d_digammas, d_nx, d_ny, d_nz, signalLength);
  checkCudaErrors( cudaGetLastError() );  // FIX: catch launch-configuration errors
  checkCudaErrors( cudaDeviceSynchronize() );

  // Segment offsets for the per-chunk reduction: chunk c covers samples
  // [c*trialLength, (c+1)*trialLength). Heap-allocated instead of the
  // original variable-length array, which is not standard C++.
  int *offsets = new int[nchunks + 1];
  for (int i = 0; i < (nchunks+1); i++) { offsets[i] = i*trialLength; }
  int *d_offsets;
  checkCudaErrors(cudaMalloc((void **) &d_offsets, (nchunks + 1)*sizeof(int)));
  checkCudaErrors(cudaMemcpy(d_offsets, offsets, (nchunks + 1)*sizeof(int), cudaMemcpyHostToDevice));
  delete[] offsets;

  // First CUB call with a NULL workspace only queries the required temporary
  // storage size; the second call performs the segmented sum.
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
      nchunks, d_offsets, d_offsets + 1);
  // Allocate temporary storage
  checkCudaErrors( cudaMalloc(&d_temp_storage, temp_storage_bytes) );
  checkCudaErrors( cudaDeviceSynchronize() );

  // Run sum-reduction
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_digammas, d_sumDigammas,
      nchunks, d_offsets, d_offsets + 1);
  checkCudaErrors( cudaDeviceSynchronize() );

  checkCudaErrors( cudaFree(d_temp_storage) );
  checkCudaErrors( cudaFree(d_offsets) );

  // Copy the nchunks partial sums back to the host and release the buffer.
  checkCudaErrors(cudaMemcpy(sumDigammas, d_sumDigammas, nchunks*sizeof(float), cudaMemcpyDeviceToHost));
  checkCudaErrors( cudaFree(d_sumDigammas) );

  return 1;
}
|
||||
|
||||
/**
|
||||
* Calculate and sum digammas in chunks fully in GPU.
|
||||
*/
|
||||
|
|
|
@ -10,6 +10,10 @@ int allocateDeviceMemory(int signalLength, int kth, int dimx, int dimy,
|
|||
float **source, float **dest, float **distances, int **indexes,
|
||||
float **radii, int **nx, int **ny, float **digammas, float *pointset);
|
||||
|
||||
int allocateDeviceMemoryCMI(int signalLength, int k, int dimx, int dimy, int dimz,
|
||||
float **source, float **dest, float **cond, float **distances, int **indexes,
|
||||
float **radii, int **nx, int **ny, int **nz, float **digammas, float *pointset);
|
||||
|
||||
int freeDeviceMemory(float *d_pointset);
|
||||
|
||||
int cudaFindKnn(int* h_bf_indexes, float* h_bf_distances, float* h_pointset,
|
||||
|
@ -49,9 +53,15 @@ int d_cudaFindRSAll(int* d_bf_npointsrange, float* d_bf_pointset, float* d_bf_qu
|
|||
int d_parallelDigammas(float *digammas, float *d_digammas, int *d_nx,
|
||||
int *d_ny, int signalLength);
|
||||
|
||||
int d_parallelDigammasCMI(float *digammas, float *d_digammas, int *d_nx,
|
||||
int *d_ny, int *d_nz, int signalLength);
|
||||
|
||||
int d_cudaSumDigammas(float *sumDigammas, int *d_nx, int *d_ny,
|
||||
float *d_digammas, int trialLength, int nchunks);
|
||||
|
||||
int d_cudaSumDigammasCMI(float *sumDigammas, int *d_nx, int *d_ny, int *d_nz,
|
||||
float *d_digammas, int trialLength, int nchunks);
|
||||
|
||||
void device_reset(void);
|
||||
|
||||
void gpuWarmUp(void);
|
||||
|
|
Loading…
Reference in New Issue