[libc] Make expm1f correctly rounded when the targets have no FMA instructions.

Add another exceptional value and fix the case when |x| is small. Performance tests with CORE-MATH project scripts: With FMA instructions on Ryzen 1700: ``` $ ./perf.sh expm1f LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a CORE-MATH reciprocal throughput : 15.362 System LIBC reciprocal throughput : 53.194 LIBC reciprocal throughput : 14.595 $ ./perf.sh expm1f --latency LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a CORE-MATH latency : 57.755 System LIBC latency : 147.020 LIBC latency : 60.269 ``` Without FMA instructions: ``` $ ./perf.sh expm1f LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a CORE-MATH reciprocal throughput : 15.362 System LIBC reciprocal throughput : 53.300 LIBC reciprocal throughput : 18.020 $ ./perf.sh expm1f --latency LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a CORE-MATH latency : 57.758 System LIBC latency : 147.025 LIBC latency : 70.304 ``` Reviewed By: michaelrj Differential Revision: https://reviews.llvm.org/D123440
2022-04-09 00:12:50 -04:00 · 2022-04-09 00:12:50 -04:00 · 484319f497
parent 4fc502368a
commit 484319f497
5 changed files with 52 additions and 28 deletions
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@ -34,6 +34,15 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
    return 0x1.8dbe62p-3f;
  }

+#if !defined(LIBC_TARGET_HAS_FMA)
+  if (unlikely(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
+    int round_mode = fputil::get_round();
+    if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
+      return -0x1.71c884p-4f;
+    return -0x1.71c882p-4f;
+  }
+#endif // LIBC_TARGET_HAS_FMA
+
  // When |x| > 25*log(2), or nan
  if (unlikely(x_abs >= 0x418a'a123U)) {
    // x < log(2^-25)
@ -70,19 +79,30 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
      // x = -0.0f
      if (unlikely(xbits.uintval() == 0x8000'0000U))
        return x;
-      // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
-      // is:
-      //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
-      //                               = |x|
-      //                               < 2^-25
-      //                               < epsilon(1)/2.
-      // So the correctly rounded values of expm1(x) are:
-      //   = x + eps(x) if rounding mode = FE_UPWARD,
-      //                   or (rounding mode = FE_TOWARDZERO and x is negative),
-      //   = x otherwise.
-      // To simplify the rounding decision and make it more efficient, we use
-      //   fma(x, x, x) ~ x + x^2 instead.
-      return fputil::multiply_add(x, x, x);
+        // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
+        // is:
+        //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
+        //                               = |x|
+        //                               < 2^-25
+        //                               < epsilon(1)/2.
+        // So the correctly rounded values of expm1(x) are:
+        //   = x + eps(x) if rounding mode = FE_UPWARD,
+        //                   or (rounding mode = FE_TOWARDZERO and x is
+        //                   negative),
+        //   = x otherwise.
+        // To simplify the rounding decision and make it more efficient, we use
+        //   fma(x, x, x) ~ x + x^2 instead.
+        // Note: to use the formula x + x^2 to decide the correct rounding, we
+        // do need fma(x, x, x) to prevent underflow caused by x*x when |x| <
+        // 2^-76. For targets without FMA instructions, we simply use double for
+        // intermediate results as it is more efficient than using an emulated
+        // version of FMA.
+#if defined(LIBC_TARGET_HAS_FMA)
+      return fputil::fma(x, x, x);
+#else
+      double xd = x;
+      return static_cast<float>(fputil::multiply_add(xd, xd, xd));
+#endif // LIBC_TARGET_HAS_FMA
    }

    // 2^-25 <= |x| < 2^-4
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@ -1189,9 +1189,6 @@ add_fp_unittest(
    libc.src.__support.FPUtil.fputil
 )

-# Without FMA instructions, the current expm1f implementation is not correctly
-# rounded for all float inputs (1 extra exceptional value). This will be fixed
-# in the followup patch: https://reviews.llvm.org/D123440
 add_fp_unittest(
  expm1f_test
  NEED_MPFR
@ -1204,8 +1201,6 @@ add_fp_unittest(
    libc.include.math
    libc.src.math.expm1f
    libc.src.__support.FPUtil.fputil
-  FLAGS
-    FMA_OPT__ONLY
 )

 add_fp_unittest(
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@ -92,7 +92,6 @@ add_fp_unittest(
  DEPENDS
    .exhaustive_test
    libc.include.math
-    libc.src.math.expf
    libc.src.math.expm1f
    libc.src.__support.FPUtil.fputil
  LINK_LIBRARIES
--- a/libc/test/src/math/exhaustive/expm1f_test.cpp
+++ b/libc/test/src/math/exhaustive/expm1f_test.cpp
@ -18,7 +18,7 @@ using FPBits = __llvm_libc::fputil::FPBits<float>;

 namespace mpfr = __llvm_libc::testing::mpfr;

-struct LlvmLibcExpfExhaustiveTest : public LlvmLibcExhaustiveTest<uint32_t> {
+struct LlvmLibcExpm1fExhaustiveTest : public LlvmLibcExhaustiveTest<uint32_t> {
  bool check(uint32_t start, uint32_t stop,
             mpfr::RoundingMode rounding) override {
    mpfr::ForceRoundingMode r(rounding);
@ -40,21 +40,21 @@ static const int NUM_THREADS = std::thread::hardware_concurrency();
 static constexpr uint32_t POS_START = 0x0000'0000U;
 static constexpr uint32_t POS_STOP = 0x42b2'0000U;

-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundNearestTieToEven) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundNearestTieToEven) {
  test_full_range(POS_START, POS_STOP, NUM_THREADS,
                  mpfr::RoundingMode::Nearest);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundUp) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundUp) {
  test_full_range(POS_START, POS_STOP, NUM_THREADS, mpfr::RoundingMode::Upward);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundDown) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundDown) {
  test_full_range(POS_START, POS_STOP, NUM_THREADS,
                  mpfr::RoundingMode::Downward);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundTowardZero) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundTowardZero) {
  test_full_range(POS_START, POS_STOP, NUM_THREADS,
                  mpfr::RoundingMode::TowardZero);
 }
@ -63,21 +63,21 @@ TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundTowardZero) {
 static constexpr uint32_t NEG_START = 0x8000'0000U;
 static constexpr uint32_t NEG_STOP = 0xc2d0'0000U;

-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundNearestTieToEven) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundNearestTieToEven) {
  test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                  mpfr::RoundingMode::Nearest);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundUp) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundUp) {
  test_full_range(NEG_START, NEG_STOP, NUM_THREADS, mpfr::RoundingMode::Upward);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundDown) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundDown) {
  test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                  mpfr::RoundingMode::Downward);
 }

-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundTowardZero) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundTowardZero) {
  test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                  mpfr::RoundingMode::TowardZero);
 }
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@ -97,6 +97,16 @@ TEST(LlvmLibcExpm1fTest, Borderline) {
  ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                 __llvm_libc::expm1f(x), 0.5);
  EXPECT_MATH_ERRNO(0);
+
+  x = float(FPBits(0x942ed494U));
+  ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                 __llvm_libc::expm1f(x), 0.5);
+  EXPECT_MATH_ERRNO(0);
+
+  x = float(FPBits(0xbdc1c6cbU));
+  ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                 __llvm_libc::expm1f(x), 0.5);
+  EXPECT_MATH_ERRNO(0);
 }

 TEST(LlvmLibcExpm1fTest, InFloatRange) {