jidt/cuda/gpuKnnBF_kernel.cu

502 lines
14 KiB
Plaintext

#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
//#include <cpugpuKnn_common.h>
#ifndef INFINITY
#define INFINITY 0x7F800000
#endif
/**
* Typedef function pointer for arbitrary norm functions.
*/
typedef float (*normFunction_t)(const float* g_uquery, const float* g_vpoint,
int pointdim, int signallength);
/**
* Calculate max norm (L-inf) between two points.
*/
__device__ __host__ float
maxMetricPoints(const float* g_uquery, const float* g_vpoint, int pointdim, int signallength){
float r_u1;
float r_v1;
float r_d1,r_dim=0;
r_dim=0;
for(int d=0; d<pointdim; d++){
r_u1 = *(g_uquery+d*signallength);
r_v1 = *(g_vpoint+d*signallength);
r_d1 = fabsf(r_v1 - r_u1);
r_dim= r_dim < r_d1? r_d1: r_dim;
}
return r_dim;
}
/**
* Calculate squared Euclidean norm (L2) between two points (note we keep the
* distance squared to avoid taking sqrt all the time).
*/
__device__ __host__ float
squareMetricPoints(const float* g_uquery, const float* g_vpoint, int pointdim, int signallength){
float r_u1;
float r_v1;
float r_d1, r_dim = 0.0f;
for (int d = 0; d < pointdim; d++) {
r_u1 = *(g_uquery+d*signallength);
r_v1 = *(g_vpoint+d*signallength);
r_d1 = r_v1 - r_u1;
r_dim += r_d1 * r_d1;
}
return r_dim;
}
__device__ normFunction_t pMaxNorm = maxMetricPoints;
__device__ normFunction_t pSquareNorm = squareMetricPoints;
/**
* Insert point in current list of nearest neighbours.
*/
__device__ float
insertPointKlist(int kth, float distance, int indexv,float* kdistances, int* kindexes){
int k=0;
while( (distance>*(kdistances+k)) && (k<kth-1)){k++;}
//Move value to the next
for(int k2=kth-1;k2>k;k2--){
*(kdistances+k2)=*(kdistances+k2-1);
*(kindexes+k2)=*(kindexes+k2-1);
}
//Replace
*(kdistances+k)=distance;
*(kindexes+k)=indexv;
//printf("\n -> Modificacion pila: %.f %.f. New max distance: %.f", *kdistances, *(kdistances+1), *(kdistances+kth-1));
return *(kdistances+kth-1);
}
/*
* Main KNN kernel. Find the nearest k neighbours to each point according
* to the supplied norm function.
*/
__global__ void
kernelKNNshared(const float* g_uquery, const float* g_vpointset,
int *g_indexes, float* g_distances, const int pointdim,
const int triallength, const int signallength, const int kth,
const int exclude, normFunction_t *normFunction) {
// Shared memory
extern __shared__ char array[];
float *kdistances;
int *kindexes;
kdistances = (float*)array;
kindexes = (int*)array+kth*blockDim.x;
const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
const unsigned int itrial = tid / triallength; // indextrial
if(tid<signallength){
for(int k=0;k<kth;k++){
kdistances[threadIdx.x*kth+k] = INFINITY;
}
__syncthreads();
float r_kdist=INFINITY;
unsigned int indexi = tid-triallength*itrial;
for(int t=0; t<triallength; t++){
int indexu = tid;
int indexv = (t + itrial*triallength);
int condition1=indexi-exclude;
int condition2=indexi+exclude;
if((t<condition1)||(t>condition2)){
float temp_dist = normFunction[0](g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
if(temp_dist <= r_kdist){
r_kdist = insertPointKlist(kth,temp_dist,t,kdistances+threadIdx.x*kth,kindexes+threadIdx.x*kth);
}
}
//printf("tid:%d indexes: %d, %d distances: %.f %.f\n",tid, *kindexes, *(kindexes+1), *kdistances, *(kdistances+1));
}
__syncthreads();
// COPY TO GLOBAL MEMORY
for (int k = 0; k < kth; k++) {
g_indexes[tid+k*signallength] = kindexes[threadIdx.x*kth+k];
g_distances[tid+k*signallength] = kdistances[threadIdx.x*kth+k];//*(kdistances+k);
}
}
}
/*
* Range search for one data point using bruteforce.
*/
__global__ void
kernelBFRSshared(const float* g_uquery, const float* g_vpointset, int *g_npoints, int pointdim, int triallength, int signallength, int exclude, float radius)
{
// Shared memory
extern __shared__ char array[];
int *s_npointsrange;
s_npointsrange = (int*)array;
const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
const unsigned int itrial = tid / triallength; // indextrial
if(tid<signallength){
s_npointsrange[threadIdx.x] = 0;
__syncthreads();
unsigned int indexi = tid-triallength*itrial;
for(int t=0; t<triallength; t++){
int indexu = tid;
int indexv = (t + itrial*triallength);
int condition1=indexi-exclude;
int condition2=indexi+exclude;
if((t<condition1)||(t>condition2)){
float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
if(temp_dist <= radius){
s_npointsrange[threadIdx.x]++;
}
}
}
__syncthreads();
// COPY TO GLOBAL MEMORY
g_npoints[tid] = s_npointsrange[threadIdx.x];
}
}
/*
* Range search using bruteforce in multiple GPUs
*/
__global__ void
kernelBFRSMultishared(const float* g_uquery, const float* g_vpointset, int *g_npoints, int pointdim, int triallength, int signallength, int exclude, const float* vecradius)
{
// shared memory
extern __shared__ char array[];
int *s_npointsrange;
s_npointsrange = (int*)array;
float radius=0;
const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
const unsigned int itrial = tid / triallength; // indextrial
if(tid<signallength){
s_npointsrange[threadIdx.x] = 0;
__syncthreads();
radius = *(vecradius+itrial);
unsigned int indexi = tid-triallength*itrial;
for(int t=0; t<triallength; t++){
int indexu = tid;
int indexv = (t + itrial*triallength);
int condition1=indexi-exclude;
int condition2=indexi+exclude;
if((t<condition1)||(t>condition2)){
float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
if(temp_dist <= radius){
s_npointsrange[threadIdx.x]++;
}
}
}
__syncthreads();
//printf("\ntid:%d npoints: %d\n",tid, s_npointsrange[threadIdx.x]);
//COPY TO GLOBAL MEMORY
g_npoints[tid] = s_npointsrange[threadIdx.x];
}
}
/*
* Range search for all data points using bruteforce.
*/
__global__ void
kernelBFRSAllshared(const float* g_uquery, const float* g_vpointset,
int *g_npoints, int pointdim, int triallength, int signallength,
int exclude, const float* vecradius, normFunction_t *normFunction) {
// Shared memory
extern __shared__ char array[];
int *s_npointsrange;
s_npointsrange = (int *) array;
float radius = 0;
const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
const unsigned int itrial = tid / triallength; // indextrial
if(tid<signallength){
s_npointsrange[threadIdx.x] = 0;
__syncthreads();
radius = *(vecradius+tid);
unsigned int indexi = tid-triallength*itrial;
for (int t=0; t<triallength; t++){
// Note: the following two definitions could be swapped depending on
// the details of surrogate implementation
int indexu = tid; // old, necessary for shuffled part of the surrogates (i.e. source)
// int indexu = indexi; // new, admissible for unshuffled part of the surrogates (i.e. dest)
int indexv = (t + itrial*triallength);
int condition1=indexi-exclude;
int condition2=indexi+exclude;
if((t<condition1)||(t>condition2)){
float temp_dist = normFunction[0](g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
// PEDRO: For KSG algorithm 1 this should be strictly less than R, and in TRENTOOL code it's less or equal. It's a float comparison, so I don't think it matters anyway.
if(temp_dist < radius){
s_npointsrange[threadIdx.x]++;
}
}
}
__syncthreads();
//COPY TO GLOBAL MEMORY
g_npoints[tid] = s_npointsrange[threadIdx.x];
}
}
/**
* Taken from NVIDIA dev forum:
* https://devtalk.nvidia.com/default/topic/516516/kernel-launch-failure-in-matlab/
*/
__device__ void digammaXp1(double *y) {
// double x = *y;
double x = *y + 1;
double neginf = -INFINITY;
const double c = 12,
digamma1 = -0.57721566490153286,
trigamma1 = 1.6449340668482264365, /* pi^2/6 */
s = 1e-6,
s3 = 1./12,
s4 = 1./120,
s5 = 1./252,
s6 = 1./240,
s7 = 1./132;
// s8 = 691./32760,
// s9 = 1./12,
// s10 = 3617./8160;
double result;
/* Illegal arguments */
if((x == neginf) || isnan(x)) {
*y = NAN;
return;
}
/* Singularities */
if((x <= 0) && (floor(x) == x)) {
*y = neginf;
return;
}
/* Negative values */
/* Use the reflection formula (Jeffrey 11.1.6):
* digamma(-x) = digamma(x+1) + pi*cot(pi*x)
*
* This is related to the identity
* digamma(-x) = digamma(x+1) - digamma(z) + digamma(1-z)
* where z is the fractional part of x
* For example:
* digamma(-3.1) = 1/3.1 + 1/2.1 + 1/1.1 + 1/0.1 + digamma(1-0.1)
* = digamma(4.1) - digamma(0.1) + digamma(1-0.1)
* Then we use
* digamma(1-z) - digamma(z) = pi*cot(pi*z)
*
if(x < 0) {
*p = digamma(p,1-x) + M_PI/tan(-M_PI*x);
return;
}
*/
/* Use Taylor series if argument <= S */
if(x <= s) {
*y = digamma1 - 1/x + trigamma1*x;
return;
}
/* Reduce to digamma(X + N) where (X + N) >= C */
result = 0;
while(x < c) {
result -= 1/x;
x++;
}
/* Use de Moivre's expansion if argument >= C */
/* This expansion can be computed in Maple via asympt(Psi(x),x) */
if(x >= c) {
double r = 1/x, t;
result += log(x) - 0.5*r;
r *= r;
#if 0
result -= r * (s3 - r * (s4 - r * (s5 - r * (s6 - r * s7))));
#else
/* this version for lame compilers */
t = (s5 - r * (s6 - r * s7));
result -= r * (s3 - r * (s4 - r * t));
#endif
}
/* assign the result to the pointer*/
*y = result;
return;
}
/**
* Optimized reduction kernel, obtained from Mark Harris' lecture on
* GPU optimization:
* https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
*/
template <unsigned int blockSize>
// __device__ void warpReduce(volatile int *sdata, unsigned int tid) {
__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
// template <unsigned int blockSize>
// __global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) {
__global__ void reduce6(int *g_nx, int *g_ny, float *g_odata, unsigned int n) {
// extern __shared__ int sdata[];
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
const unsigned int blockSize = 512;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i<n) {
double dgX1 = (double) g_nx[i];
double dgY1 = (double) g_ny[i];
double dgX2 = (double) g_nx[i+blockSize];
double dgY2 = (double) g_ny[i+blockSize];
digammaXp1(&dgX1);
digammaXp1(&dgY1);
digammaXp1(&dgX2);
digammaXp1(&dgY2);
sdata[tid] = dgX1 + dgY1 + dgX2 + dgY2;
i += gridSize;
}
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce<blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce1(int *g_nx, int *g_ny, float *g_odata, unsigned int N) {
extern __shared__ float sdata2[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < N) {
double dgX = (double) g_nx[i];
double dgY = (double) g_ny[i];
digammaXp1(&dgX);
digammaXp1(&dgY);
sdata2[tid] = (float) (dgX + dgY);
} else {
sdata2[tid] = 0;
}
__syncthreads();
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0) {
sdata2[tid] += sdata2[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) {
g_odata[blockIdx.x] = sdata2[0];
}
}
__global__ void gpuDigammas(float *g_digammas, int *g_nx, int *g_ny, int signallength) {
const unsigned int i = threadIdx.x + blockDim.x*blockIdx.x;
if(i < signallength){
// Fetch n and put it in thread memory
double dgX = (double) g_nx[i];
double dgY = (double) g_ny[i];
// In-place digamma calculation
digammaXp1(&dgX);
digammaXp1(&dgY);
// Copy back to global memory
g_digammas[i] = (float) (dgX + dgY);
}
return;
}
__global__ void gpuDigammasCMI(float *g_digammas, int *g_nx, int *g_ny, int *g_nz, int signallength) {
const unsigned int i = threadIdx.x + blockDim.x*blockIdx.x;
if(i < signallength){
// Fetch n and put it in thread memory
double dgX = (double) g_nx[i];
double dgY = (double) g_ny[i];
double dgZ = (double) g_nz[i];
// In-place digamma calculation
digammaXp1(&dgX);
digammaXp1(&dgY);
digammaXp1(&dgZ);
// Copy back to global memory
g_digammas[i] = (float) (dgX + dgY - dgZ);
}
return;
}
#endif // #ifndef _TEMPLATE_KERNEL_H_