forked from OSchip/llvm-project
[libc][math] Simplify tanf implementation and improve its performance.
Simplify `tanf` implementation and improve its performance. Completely reuse the implementation of `sinf`, `cosf`, `sincosf` and use the definition `tan(x) = sin(x)/cos(x)`. Performance benchmark using perf tool from the CORE-MATH project on Ryzen 1700: ``` $ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh tanf GNU libc version: 2.35 GNU libc release: stable CORE-MATH reciprocal throughput : 18.558 System LIBC reciprocal throughput : 49.919 BEFORE: LIBC reciprocal throughput : 36.480 LIBC reciprocal throughput : 27.217 (with `-msse4.2` flag) LIBC reciprocal throughput : 20.205 (with `-mfma` flag) AFTER: LIBC reciprocal throughput : 30.337 LIBC reciprocal throughput : 21.072 (with `-msse4.2` flag) LIBC reciprocal throughput : 15.804 (with `-mfma` flag) $ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh tanf --latency GNU libc version: 2.35 GNU libc release: stable CORE-MATH latency : 56.702 System LIBC latency : 107.206 BEFORE LIBC latency : 97.598 LIBC latency : 91.119 (with `-msse4.2` flag) LIBC latency : 82.655 (with `-mfma` flag) AFTER LIBC latency : 74.560 LIBC latency : 66.575 (with `-msse4.2` flag) LIBC latency : 61.636 (with `-mfma` flag) ``` Reviewed By: zimmermann6 Differential Revision: https://reviews.llvm.org/D134575
This commit is contained in:
parent
f3e02989e6
commit
e15b2da42f
|
@ -251,7 +251,7 @@ Performance
|
|||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||
| sinhf | 13 | 63 | 48 | 137 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA |
|
||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||
| tanf | 19 | 50 | 82 | 107 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||
| tanf | 16 | 50 | 61 | 107 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA |
|
||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||
| tanhf | 13 | 55 | 57 | 123 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA |
|
||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||
|
|
|
@ -138,6 +138,7 @@ add_entrypoint_object(
|
|||
../tanf.h
|
||||
DEPENDS
|
||||
.range_reduction
|
||||
.sincosf_utils
|
||||
libc.include.math
|
||||
libc.src.errno.errno
|
||||
libc.src.__support.FPUtil.fenv_impl
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/math/tanf.h"
|
||||
#include "sincosf_utils.h"
|
||||
#include "src/__support/FPUtil/FEnvImpl.h"
|
||||
#include "src/__support/FPUtil/FPBits.h"
|
||||
#include "src/__support/FPUtil/PolyEval.h"
|
||||
|
@ -17,115 +18,32 @@
|
|||
|
||||
#include <errno.h>
|
||||
|
||||
#if defined(LIBC_TARGET_HAS_FMA)
|
||||
#include "range_reduction_fma.h"
|
||||
// using namespace __llvm_libc::fma;
|
||||
using __llvm_libc::fma::FAST_PASS_BOUND;
|
||||
using __llvm_libc::fma::large_range_reduction;
|
||||
using __llvm_libc::fma::small_range_reduction;
|
||||
#else
|
||||
#include "range_reduction.h"
|
||||
// using namespace __llvm_libc::generic;
|
||||
using __llvm_libc::generic::FAST_PASS_BOUND;
|
||||
using __llvm_libc::generic::large_range_reduction;
|
||||
using __llvm_libc::generic::small_range_reduction;
|
||||
#endif
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
// Lookup table for tan(k * pi/32) with k = -15..15 organized as follow:
|
||||
// TAN_K_OVER_32[k] = tan(k * pi/32) for k = 0..15
|
||||
// TAN_K_OVER_32[k] = tan((k - 31) * pi/32) for k = 16..31.
|
||||
// This organization allows us to simply do the lookup:
|
||||
// TAN_K_OVER_32[k & 31] for k of type int(32/64) with 2-complement
|
||||
// representation.
|
||||
// The values of tan(k * pi/32) are generated by Sollya with:
|
||||
// for k from 0 -15 to 15 do { round(tan(k*pi/32), D, RN); };
|
||||
static constexpr double TAN_K_PI_OVER_32[32] = {
|
||||
0.0000000000000000, 0x1.936bb8c5b2da2p-4, 0x1.975f5e0553158p-3,
|
||||
0x1.36a08355c63dcp-2, 0x1.a827999fcef32p-2, 0x1.11ab7190834ecp-1,
|
||||
0x1.561b82ab7f99p-1, 0x1.a43002ae4285p-1, 0x1.0000000000000p0,
|
||||
0x1.37efd8d87607ep0, 0x1.7f218e25a7461p0, 0x1.def13b73c1406p0,
|
||||
0x1.3504f333f9de6p1, 0x1.a5f59e90600ddp1, 0x1.41bfee2424771p2,
|
||||
0x1.44e6c595afdccp3, -0x1.44e6c595afdccp3, -0x1.41bfee2424771p2,
|
||||
-0x1.a5f59e90600ddp1, -0x1.3504f333f9de6p1, -0x1.def13b73c1406p0,
|
||||
-0x1.7f218e25a7461p0, -0x1.37efd8d87607ep0, -0x1.0000000000000p0,
|
||||
-0x1.a43002ae4285p-1, -0x1.561b82ab7f99p-1, -0x1.11ab7190834ecp-1,
|
||||
-0x1.a827999fcef32p-2, -0x1.36a08355c63dcp-2, -0x1.975f5e0553158p-3,
|
||||
-0x1.936bb8c5b2da2p-4, 0.0000000000000000,
|
||||
};
|
||||
|
||||
// Exceptional cases for tanf.
|
||||
static constexpr size_t N_EXCEPTS = 6;
|
||||
constexpr size_t N_EXCEPTS = 6;
|
||||
|
||||
static constexpr fputil::ExceptValues<float, N_EXCEPTS> TANF_EXCEPTS{{
|
||||
constexpr fputil::ExceptValues<float, N_EXCEPTS> TANF_EXCEPTS{{
|
||||
// (inputs, RZ output, RU offset, RD offset, RN offset)
|
||||
// x = 0x1.3ae898p39, tan(x) = 0x1.23d43cp12 (RZ)
|
||||
{0x531d744c, 0x4591ea1e, 1, 0, 1},
|
||||
// x = 0x1.ada6aap27, tan(x) = 0x1.e80304p-3 (RZ)
|
||||
{0x4d56d355, 0x3e740182, 1, 0, 0},
|
||||
// x = 0x1.862064p33, tan(x) = -0x1.8dee56p-3 (RZ)
|
||||
{0x50431032, 0xbe46f72b, 0, 1, 1},
|
||||
// x = 0x1.af61dap48, tan(x) = 0x1.60d1c6p-2 (RZ)
|
||||
{0x57d7b0ed, 0x3eb068e3, 1, 0, 1},
|
||||
// x = 0x1.dd0d2ap76, tan(x) = -0x1.465f1cp22 (RZ)
|
||||
{0x65ee8695, 0xcaa32f8e, 0, 1, 0},
|
||||
// x = 0x1.31fc9ep80, tan(x) = 0x1.3c13eep13 (RZ)
|
||||
{0x6798fe4f, 0x461e09f7, 1, 0, 0},
|
||||
// x = 0x1.0088bcp52, tan(x) = 0x1.ca1edp0 (RZ)
|
||||
{0x5980445e, 0x3fe50f68, 1, 0, 0},
|
||||
// x = 0x1.f90dfcp72, tan(x) = 0x1.597f9cp-1 (RZ)
|
||||
{0x63fc86fe, 0x3f2cbfce, 1, 0, 0},
|
||||
// x = 0x1.a6ce12p86, tan(x) = -0x1.c5612ep-1 (RZ)
|
||||
{0x6ad36709, 0xbf62b097, 0, 1, 0},
|
||||
// x = 0x1.6a0b76p102, tan(x) = -0x1.e42a1ep0 (RZ)
|
||||
{0x72b505bb, 0xbff2150f, 0, 1, 0},
|
||||
}};
|
||||
|
||||
LLVM_LIBC_FUNCTION(float, tanf, (float x)) {
|
||||
using FPBits = typename fputil::FPBits<float>;
|
||||
FPBits xbits(x);
|
||||
constexpr double SIGN[2] = {1.0, -1.0};
|
||||
double x_sign = SIGN[xbits.uintval() >> 31];
|
||||
|
||||
xbits.set_sign(false);
|
||||
uint32_t x_abs = xbits.uintval();
|
||||
|
||||
// Range reduction:
|
||||
//
|
||||
// Since tan(x) is an odd function,
|
||||
// tan(x) = -tan(-x),
|
||||
// By replacing x with -x if x is negative, we can assume in the following
|
||||
// that x is non-negative.
|
||||
//
|
||||
// We perform a range reduction mod pi/32, so that we ca have a good
|
||||
// polynomial approximation of tan(x) around [-pi/32, pi/32]. Since tan(x) is
|
||||
// periodic with period pi, in the first step of range reduction, we find k
|
||||
// and y such that:
|
||||
// x = (k + y) * pi/32,
|
||||
// where k is an integer, and |y| <= 0.5.
|
||||
// Moreover, we only care about the lowest 5 bits of k, since
|
||||
// tan((k + 32) * pi/32) = tan(k * pi/32 + pi) = tan(k * pi/32).
|
||||
// So after the reduction k = k & 31, we can assume that 0 <= k <= 31.
|
||||
//
|
||||
// For the second step, since tan(x) has a singularity at pi/2, we need a
|
||||
// further reduction so that:
|
||||
// k * pi/32 < pi/2, or equivalently, 0 <= k < 16.
|
||||
// So if k >= 16, we perform the following transformation:
|
||||
// tan(x) = tan(x - pi) = tan((k + y) * pi/32 - pi)
|
||||
// = tan((k - 31 + y - 1) * pi/32)
|
||||
// = tan((k - 31) * pi/32 + (y - 1) * pi/32)
|
||||
// = tan(k' * pi/32 + y' * pi/32)
|
||||
// Notice that we only subtract k by 31, not 32, to make sure that |k'| < 16.
|
||||
// In fact, the range of k' is: -15 <= k' <= 0.
|
||||
// But the range of y' is now: -1.5 <= y' <= -0.5.
|
||||
// If we perform round to zero in the first step of finding k and y, so that
|
||||
// 0 <= y <= 1, then the range of y' would be -1 <= y' <= 0, then we can
|
||||
// reduce the degree of polynomial approximation using to approximate
|
||||
// tan(y* pi/32) by 1 or 2 terms.
|
||||
// In any case, for simplicity and to reuse the same range reduction as sinf
|
||||
// and cosf, we opted to use the former range: [-1.5, 1.5] * pi/32 for
|
||||
// the polynomial approximation step.
|
||||
//
|
||||
// Once k and y are computed, we then deduce the answer by the tangent of sum
|
||||
// formula:
|
||||
// tan(x) = tan((k + y)*pi/32)
|
||||
// = (tan(y*pi/32) + tan(k*pi/32)) / (1 - tan(y*pi/32)*tan(k*pi/32))
|
||||
// The values of tan(k*pi/32) for k = -15..15 are precomputed and stored using
|
||||
// a vector of 31 doubles. Tan(y*pi/32) is computed using degree-9 minimax
|
||||
// polynomials generated by Sollya.
|
||||
bool x_sign = xbits.uintval() >> 31;
|
||||
uint32_t x_abs = xbits.uintval() & 0x7fff'ffffU;
|
||||
|
||||
// |x| < pi/32
|
||||
if (unlikely(x_abs <= 0x3dc9'0fdbU)) {
|
||||
|
@ -173,56 +91,45 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) {
|
|||
return xd * result;
|
||||
}
|
||||
|
||||
// Inf or NaN
|
||||
if (unlikely(x_abs >= 0x7f80'0000U)) {
|
||||
if (x_abs == 0x7f80'0000U) {
|
||||
errno = EDOM;
|
||||
fputil::set_except(FE_INVALID);
|
||||
}
|
||||
return x +
|
||||
FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
|
||||
// Check for exceptional values
|
||||
if (unlikely(x_abs == 0x3f8a1f62U)) {
|
||||
// |x| = 0x1.143ec4p0
|
||||
float sign = x_sign ? -1.0f : 1.0f;
|
||||
|
||||
return fputil::multiply_add(sign, 0x1.ddf9f4p0f, sign * 0x1.1p-24f);
|
||||
}
|
||||
|
||||
int64_t k;
|
||||
double y;
|
||||
double xd = static_cast<double>(xbits.get_val());
|
||||
|
||||
// Perform the first step of range reduction: find k and y such that
|
||||
// x = (k + y) * pi/32,
|
||||
// where k is an integer, and |y| <= 0.5.
|
||||
if (likely(x_abs < FAST_PASS_BOUND)) {
|
||||
k = small_range_reduction(xd, y);
|
||||
} else {
|
||||
|
||||
if (auto r = TANF_EXCEPTS.lookup_odd(x_abs, x_sign <= 0.0);
|
||||
// |x| > 0x1.ada6a8p+27f
|
||||
if (unlikely(x_abs > 0x4d56'd354U)) {
|
||||
// Inf or NaN
|
||||
if (unlikely(x_abs >= 0x7f80'0000U)) {
|
||||
if (x_abs == 0x7f80'0000U) {
|
||||
errno = EDOM;
|
||||
fputil::set_except(FE_INVALID);
|
||||
}
|
||||
return x +
|
||||
FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
|
||||
}
|
||||
// Other large exceptional values
|
||||
if (auto r = TANF_EXCEPTS.lookup_odd(x_abs, x_sign);
|
||||
unlikely(r.has_value()))
|
||||
return r.value();
|
||||
|
||||
fputil::FPBits<float> x_bits(x_abs);
|
||||
k = large_range_reduction(xd, x_bits.get_exponent(), y);
|
||||
}
|
||||
|
||||
// Only care about the lowest 5 bits of k.
|
||||
k &= 31;
|
||||
// Adjust y if k >= 16.
|
||||
constexpr double ADJUSTMENT[2] = {0.0, -1.0};
|
||||
y += ADJUSTMENT[k >> 4];
|
||||
// For |x| >= pi/32, we use the definition of tan(x) function:
|
||||
// tan(x) = sin(x) / cos(x)
|
||||
// The we follow the same computations of sin(x) and cos(x) as sinf, cosf,
|
||||
// and sincosf.
|
||||
|
||||
double tan_k = TAN_K_PI_OVER_32[k];
|
||||
double xd = static_cast<double>(x);
|
||||
double sin_k, cos_k, sin_y, cosm1_y;
|
||||
|
||||
// Degree-10 minimax odd polynomial for tan(y * pi/32)/y generated by Sollya
|
||||
// with:
|
||||
// > P = fpminimax(tan(y*pi/32)/y, [|0, 2, 4, 6, 8, 10|], [|D...|], [0, 1.5]);
|
||||
double ysq = y * y;
|
||||
double tan_y =
|
||||
y * fputil::polyeval(ysq, 0x1.921fb54442d17p-4, 0x1.4abbce625e84cp-12,
|
||||
0x1.466bc669afd51p-20, 0x1.460013a5aae3p-28,
|
||||
0x1.45de3dc438976p-36, 0x1.4eaeead85bef4p-44);
|
||||
|
||||
// Combine the results with the tangent of sum formula:
|
||||
// tan(x) = tan((k + y)*pi/32)
|
||||
// = (tan(k*pi/32) + tan(k*pi/32)) / (1 - tan(y*pi/32)*tan(k*pi/32))
|
||||
return x_sign * (tan_y + tan_k) / fputil::multiply_add(tan_y, -tan_k, 1.0);
|
||||
sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y);
|
||||
// tan(x) = sin(x) / cos(x)
|
||||
// = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
|
||||
using fputil::multiply_add;
|
||||
return multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
|
||||
multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k));
|
||||
}
|
||||
|
||||
} // namespace __llvm_libc
|
||||
|
|
|
@ -56,13 +56,14 @@ TEST(LlvmLibcTanfTest, InFloatRange) {
|
|||
}
|
||||
|
||||
TEST(LlvmLibcTanfTest, SpecificBitPatterns) {
|
||||
constexpr int N = 48;
|
||||
constexpr int N = 54;
|
||||
constexpr uint32_t INPUTS[N] = {
|
||||
0x3a7a'8d2fU, // x = 0x1.f51a5ep-11f
|
||||
0x3f06'0a92U, // x = pi/6
|
||||
0x3f3a'dc51U, // x = 0x1.75b8a2p-1f
|
||||
0x3f49'0fdbU, // x = pi/4
|
||||
0x3f86'0a92U, // x = pi/3
|
||||
0x3f8a'1f62U, // x = 0x1.143ec4p+0f
|
||||
0x3fa7'832aU, // x = 0x1.4f0654p+0f
|
||||
0x3fc9'0fdbU, // x = pi/2
|
||||
0x4017'1973U, // x = 0x1.2e32e6p+1f
|
||||
|
@ -75,14 +76,18 @@ TEST(LlvmLibcTanfTest, SpecificBitPatterns) {
|
|||
0x474d'246fU, // x = 0x1.9a48dep+15f
|
||||
0x4afd'ece4U, // x = 0x1.fbd9c8p+22f
|
||||
0x4c23'32e9U, // x = 0x1.4665d2p+25f
|
||||
0x4d56'd355U, // x = 0x1.ada6aap+27f
|
||||
0x5043'1032U, // x = 0x1.862064p+33f
|
||||
0x50a3'e87fU, // x = 0x1.47d0fep+34f
|
||||
0x5239'47f6U, // x = 0x1.728fecp+37f
|
||||
0x531d'744cU, // x = 0x1.3ae898p+39f
|
||||
0x53b1'46a6U, // x = 0x1.628d4cp+40f
|
||||
0x5532'5019U, // x = 0x1.64a032p+43f
|
||||
0x55ca'fb2aU, // x = 0x1.95f654p+44f
|
||||
0x57d7'b0edU, // x = 0x1.af61dap+48f
|
||||
0x588e'f060U, // x = 0x1.1de0cp+50f
|
||||
0x5922'aa80U, // x = 0x1.4555p+51f
|
||||
0x5980'445eU, // x = 0x1.0088bcp+52f
|
||||
0x5aa4'542cU, // x = 0x1.48a858p+54f
|
||||
0x5c07'bcd0U, // x = 0x1.0f79ap+57f
|
||||
0x5ebc'fddeU, // x = 0x1.79fbbcp+62f
|
||||
|
@ -91,6 +96,7 @@ TEST(LlvmLibcTanfTest, SpecificBitPatterns) {
|
|||
0x6115'cb11U, // x = 0x1.2b9622p+67f
|
||||
0x61a4'0b40U, // x = 0x1.48168p+68f
|
||||
0x6386'134eU, // x = 0x1.0c269cp+72f
|
||||
0x63fc'86feU, // x = 0x1.f90dfcp+72f
|
||||
0x6589'8498U, // x = 0x1.13093p+76f
|
||||
0x65ee'8695U, // x = 0x1.dd0d2ap+76f
|
||||
0x6600'0001U, // x = 0x1.000002p+77f
|
||||
|
|
Loading…
Reference in New Issue