diff --git a/libclc/generic/include/clc/clc.h b/libclc/generic/include/clc/clc.h index 45b107cb86a3..0fc8530e9949 100644 --- a/libclc/generic/include/clc/clc.h +++ b/libclc/generic/include/clc/clc.h @@ -32,6 +32,7 @@ #include /* 6.11.2 Math Functions */ +#include #include #include #include diff --git a/libclc/generic/include/clc/math/acos.h b/libclc/generic/include/clc/math/acos.h new file mode 100644 index 000000000000..e753dee36aa5 --- /dev/null +++ b/libclc/generic/include/clc/math/acos.h @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include diff --git a/libclc/generic/include/clc/math/acos.inc b/libclc/generic/include/clc/math/acos.inc new file mode 100644 index 000000000000..4ca8c7538aef --- /dev/null +++ b/libclc/generic/include/clc/math/acos.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE acos(__CLC_GENTYPE x); diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index 22fc1fb22920..577671cc216c 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -29,6 +29,7 @@ integer/sub_sat.cl integer/sub_sat_if.ll integer/sub_sat_impl.ll integer/upsample.cl +math/acos.cl math/atan.cl math/atan2.cl math/copysign.cl diff --git a/libclc/generic/lib/math/acos.cl b/libclc/generic/lib/math/acos.cl new file mode 100644 index 000000000000..3ce96554fef3 --- /dev/null +++ b/libclc/generic/lib/math/acos.cl @@ -0,0 +1,8 @@ +#include + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY +#include diff --git a/libclc/generic/lib/math/acos.inc b/libclc/generic/lib/math/acos.inc new file mode 100644 index 000000000000..8612415f37bd --- /dev/null +++ b/libclc/generic/lib/math/acos.inc @@ -0,0 +1,21 @@ +/* + * There are multiple formulas for calculating arccosine of x: + * 1) acos(x) = pi/2 + i * ln(i*x + sqrt(1-x^2)) (note the 'i': needs complex math) 
+ * 2) acos(x) = pi/2 + asin(-x) (needs asin, which is not implemented yet)
+ * 3) acos(x) = pi/2 - asin(x) (same asin dependency)
+ * 4) acos(x) = 2 * atan2(sqrt(1-x), sqrt(1+x))
+ * 5) acos(x) = pi/2 - atan2(x, sqrt(1-x^2))
+ *
+ * Options 1-3 are not usable yet; the code below implements #4. #5 reportedly
+ * yields more concise radeonsi bitcode/assembly (134 vs 132 instructions --
+ * NOTE(review): confirm which count is which), but #4 may be more precise.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE acos(__CLC_GENTYPE x) {
+  /* Formula #4 above; the casts give the literals the generic type. */
+  const __CLC_GENTYPE one = (__CLC_GENTYPE) 1.0;
+  const __CLC_GENTYPE two = (__CLC_GENTYPE) 2.0;
+  const __CLC_GENTYPE root_minus = sqrt(one - x);
+  const __CLC_GENTYPE root_plus = sqrt(one + x);
+  return two * atan2(root_minus, root_plus);
+}