Implement cos for double types

This implementation was ported from the AMD builtin library
and has been tested with piglit, OpenCV, and the ocl conformance tests.

llvm-svn: 237154
This commit is contained in:
Tom Stellard 2015-05-12 17:18:46 +00:00
parent e0de8f4efb
commit 2e6ff0c66e
5 changed files with 289 additions and 7 deletions

View File

@ -52,14 +52,24 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define __CLC_FUNCTION __clc_cos_intrinsic
#define __CLC_INTRINSIC "llvm.cos"
#include <clc/math/unary_intrin.inc>
#undef __CLC_FUNCTION
#undef __CLC_INTRINSIC
_CLC_OVERLOAD _CLC_DEF double cos(double x) {
return __clc_cos_intrinsic(x);
x = fabs(x);
double r, rr;
int regn;
if (x < 0x1.0p+47)
__clc_remainder_piby2_medium(x, &r, &rr, &regn);
else
__clc_remainder_piby2_large(x, &r, &rr, &regn);
double2 sc = __clc_sincos_piby4(r, rr);
sc.lo = -sc.lo;
int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
c.hi ^= (regn > 1) << 31;
return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
}
_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);

View File

@ -23,11 +23,15 @@
#include <clc/clc.h>
#include "math.h"
#include "tables.h"
#include "sincos_helpers.h"
#define bitalign(hi, lo, shift) \
((hi) << (32 - (shift))) | ((lo) >> (shift));
#define bytealign(src0, src1, src2) \
((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
_CLC_DEF float __clc_sinf_piby4(float x, float y) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
@ -302,3 +306,240 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
return __clc_argReductionLargeS(r, rr, x);
}
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Reduction for medium sized arguments
_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
// How many pi/2 is x a multiple of?
const double two_by_pi = 0x1.45f306dc9c883p-1;
double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
// Compute product of npi2 with 159 bits of 2/pi
double p_hh = piby2_h * dnpi2;
double p_ht = fma(piby2_h, dnpi2, -p_hh);
double p_mh = piby2_m * dnpi2;
double p_mt = fma(piby2_m, dnpi2, -p_mh);
double p_th = piby2_t * dnpi2;
double p_tt = fma(piby2_t, dnpi2, -p_th);
// Reduce to 159 bits
double ph = p_hh;
double pm = p_ht + p_mh;
double t = p_mh - (pm - p_ht);
double pt = p_th + t + p_mt + p_tt;
t = ph + pm; pm = pm - (t - ph); ph = t;
t = pm + pt; pt = pt - (t - pm); pm = t;
// Subtract from x
t = x + ph;
double qh = t + pm;
double qt = pm - (qh - t) + pt;
*r = qh;
*rr = qt;
*regn = (int)(long)dnpi2 & 0x3;
}
// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
// extra precision, and return the result in r, rr.
// Return value "regn" tells how many lots of pi/2 were subtracted
// from x to put it in the range [-pi/4,pi/4], mod 4.
_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
long ux = as_long(x);
int e = (int)(ux >> 52) - 1023;
int i = max(23, (e >> 3) + 17);
int j = 150 - i;
int j16 = j & ~0xf;
double fract_temp;
// The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
uint4 q0 = USE_TABLE(pibits_tbl, j16);
uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
int k = (j >> 2) & 0x3;
int4 c = (int4)k == (int4)(0, 1, 2, 3);
uint u0, u1, u2, u3, u4, u5, u6;
u0 = c.s1 ? q0.s1 : q0.s0;
u0 = c.s2 ? q0.s2 : u0;
u0 = c.s3 ? q0.s3 : u0;
u1 = c.s1 ? q0.s2 : q0.s1;
u1 = c.s2 ? q0.s3 : u1;
u1 = c.s3 ? q1.s0 : u1;
u2 = c.s1 ? q0.s3 : q0.s2;
u2 = c.s2 ? q1.s0 : u2;
u2 = c.s3 ? q1.s1 : u2;
u3 = c.s1 ? q1.s0 : q0.s3;
u3 = c.s2 ? q1.s1 : u3;
u3 = c.s3 ? q1.s2 : u3;
u4 = c.s1 ? q1.s1 : q1.s0;
u4 = c.s2 ? q1.s2 : u4;
u4 = c.s3 ? q1.s3 : u4;
u5 = c.s1 ? q1.s2 : q1.s1;
u5 = c.s2 ? q1.s3 : u5;
u5 = c.s3 ? q2.s0 : u5;
u6 = c.s1 ? q1.s3 : q1.s2;
u6 = c.s2 ? q2.s0 : u6;
u6 = c.s3 ? q2.s1 : u6;
uint v0 = bytealign(u1, u0, j);
uint v1 = bytealign(u2, u1, j);
uint v2 = bytealign(u3, u2, j);
uint v3 = bytealign(u4, u3, j);
uint v4 = bytealign(u5, u4, j);
uint v5 = bytealign(u6, u5, j);
// Place those 192 bits in 4 48-bit doubles along with correct exponent
// If i > 1018 we would get subnormals so we scale p up and x down to get the same product
i = 2 + 8*i;
x *= i > 1018 ? 0x1.0p-136 : 1.0;
i -= i > 1018 ? 136 : 0;
uint ua = (uint)(1023 + 52 - i) << 20;
double a = as_double((uint2)(0, ua));
double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
ua += 0x03000000U;
a = as_double((uint2)(0, ua));
double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
ua += 0x03000000U;
a = as_double((uint2)(0, ua));
double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
ua += 0x03000000U;
a = as_double((uint2)(0, ua));
double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
// Exact multiply
double f0h = p0 * x;
double f0l = fma(p0, x, -f0h);
double f1h = p1 * x;
double f1l = fma(p1, x, -f1h);
double f2h = p2 * x;
double f2l = fma(p2, x, -f2h);
double f3h = p3 * x;
double f3l = fma(p3, x, -f3h);
// Accumulate product into 4 doubles
double s, t;
double f3 = f3h + f2h;
t = f2h - (f3 - f3h);
s = f3l + t;
t = t - (s - f3l);
double f2 = s + f1h;
t = f1h - (f2 - s) + t;
s = f2l + t;
t = t - (s - f2l);
double f1 = s + f0h;
t = f0h - (f1 - s) + t;
s = f1l + t;
double f0 = s + f0l;
// Strip off unwanted large integer bits
f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
// Compute least significant integer bits
t = f3 + f2;
double di = t - fract(t, &fract_temp);
i = (float)di;
// Shift out remaining integer part
f3 -= di;
s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
f1 += f0;
// Subtract 1 if fraction is >= 0.5, and update regn
int g = f3 >= 0.5;
i += g;
f3 -= (float)g;
// Shift up bits
s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
// Multiply precise fraction by pi/2 to get radians
const double p2h = 7074237752028440.0 / 0x1.0p+52;
const double p2t = 4967757600021510.0 / 0x1.0p+106;
double rhi = f3 * p2h;
double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
*r = rhi + rlo;
*rr = rlo - (*r - rhi);
*regn = i & 0x3;
}
_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
// = x * f(w)
// where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
// We use a minimax approximation of (f(w) - 1) / w
// because this produces an expansion in even powers of x.
// If xx (the tail of x) is non-zero, we add a correction
// term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
// is an approximation to cos(x)*sin(xx) valid because
// xx is tiny relative to x.
// Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
// = f(w)
// where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
// We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
// because this produces an expansion in even powers of x.
// If xx (the tail of x) is non-zero, we subtract a correction
// term g(x,xx) = x*xx to the result, where g(x,xx)
// is an approximation to sin(x)*sin(xx) valid because
// xx is tiny relative to x.
const double sc1 = -0.166666666666666646259241729;
const double sc2 = 0.833333333333095043065222816e-2;
const double sc3 = -0.19841269836761125688538679e-3;
const double sc4 = 0.275573161037288022676895908448e-5;
const double sc5 = -0.25051132068021699772257377197e-7;
const double sc6 = 0.159181443044859136852668200e-9;
const double cc1 = 0.41666666666666665390037e-1;
const double cc2 = -0.13888888888887398280412e-2;
const double cc3 = 0.248015872987670414957399e-4;
const double cc4 = -0.275573172723441909470836e-6;
const double cc5 = 0.208761463822329611076335e-8;
const double cc6 = -0.113826398067944859590880e-10;
double x2 = x * x;
double x3 = x2 * x;
double r = 0.5 * x2;
double t = 1.0 - r;
double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
x2*x2, fma(x, xx, (1.0 - t) - r));
double2 ret;
ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
ret.hi = cp;
return ret;
}
#endif

View File

@ -23,3 +23,13 @@
_CLC_DECL float __clc_sinf_piby4(float x, float y);
_CLC_DECL float __clc_cosf_piby4(float x, float y);
_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
#endif

View File

@ -288,9 +288,29 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = {
0x1.000000p+0f,
};
DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
230, 139, 2, 0, 0, 0, 0, 0, 0, 0
};
TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
return *(__constant uint4 *)(PIBITS_TBL + idx);
}
#ifdef cl_khr_fp64
DECLARE_TABLE(double2, LN_TBL, 65) = {

View File

@ -40,6 +40,7 @@
TABLE_FUNCTION_DECL(float2, loge_tbl);
TABLE_FUNCTION_DECL(float, log_inv_tbl);
TABLE_FUNCTION_DECL(uint4, pibits_tbl);
#ifdef cl_khr_fp64