[libc][math] fmod/fmodf implementation.

This is a implementation of find remainder fmod function from standard libm. The underline algorithm is developed by myself, but probably it was first invented before. Some features of the implementation: 1. The code is written on more-or-less modern C++. 2. One general implementation for both float and double precision numbers. 3. Spitted platform/architecture dependent and independent code and tests. 4. Tests covers 100% of the code for both float and double numbers. Tests cases with NaN/Inf etc is copied from glibc. 5. The new implementation in general 2-4 times faster for “regular” x,y values. It can be 20 times faster for x/y huge value, but can also be 2 times slower for double denormalized range (according to perf tests provided). 6. Two different implementation of division loop are provided. In some platforms division can be very time consuming operation. Depend on platform it can be 3-10 times slower than multiplication. Performance tests: The test is based on core-math project (https://gitlab.inria.fr/core-math/core-math). By Tue Ly suggestion I took hypot function and use it as template for fmod. Preserving all test cases. `./check.sh <--special|--worst> fmodf` passed. `CORE_MATH_PERF_MODE=rdtsc ./perf.sh fmodf` results are ``` GNU libc version: 2.35 GNU libc release: stable 21.166 <-- FPU 51.031 <-- current glibc 37.659 <-- this fmod version. ```
2022-06-06 10:10:24 +02:00 · 2022-06-06 10:10:24 +02:00 · b8e8012aa2
parent 5c29ffda90
commit b8e8012aa2
34 changed files with 1024 additions and 9 deletions
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@ -131,6 +131,8 @@ set(TARGET_LIBM_ENTRYPOINTS
    libc.src.math.fmin
    libc.src.math.fminf
    libc.src.math.fminl
+    libc.src.math.fmod
+    libc.src.math.fmodf
    libc.src.math.frexp
    libc.src.math.frexpf
    libc.src.math.frexpl
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@ -150,6 +150,8 @@ set(TARGET_LIBM_ENTRYPOINTS
    libc.src.math.fmin
    libc.src.math.fminf
    libc.src.math.fminl
+    libc.src.math.fmod
+    libc.src.math.fmodf
    libc.src.math.frexp
    libc.src.math.frexpf
    libc.src.math.frexpl
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@ -156,6 +156,8 @@ set(TARGET_LIBM_ENTRYPOINTS
    libc.src.math.fmax
    libc.src.math.fmaxf
    libc.src.math.fmaxl
+    libc.src.math.fmod
+    libc.src.math.fmodf
    libc.src.math.frexp
    libc.src.math.frexpf
    libc.src.math.frexpl
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@ -133,6 +133,8 @@ set(TARGET_LIBM_ENTRYPOINTS
    libc.src.math.fmax
    libc.src.math.fmaxf
    libc.src.math.fmaxl
+    libc.src.math.fmod
+    libc.src.math.fmodf
    libc.src.math.frexp
    libc.src.math.frexpf
    libc.src.math.frexpl
--- a/libc/docs/math.rst
+++ b/libc/docs/math.rst
@ -76,7 +76,7 @@ fdim           |check|          |check|         |check|
 floor          |check|          |check|         |check|  
 fmax           |check|          |check|         |check|  
 fmin           |check|          |check|         |check|  
-fmod
+fmod           |check|          |check|
 fpclassify
 frexp          |check|          |check|         |check|  
 ilogb          |check|          |check|         |check|  
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@ -378,6 +378,10 @@ def StdC : StandardSpec<"stdc"> {
          FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
          FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,

+          FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
+
+          FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
+
          FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
          FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
          FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>,
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@ -13,6 +13,8 @@

 #include "src/__support/CPP/Bit.h"
 #include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/builtin_wrappers.h"
+#include "src/__support/common.h"

 #include "FloatProperties.h"
 #include <stdint.h>
@ -160,6 +162,33 @@ template <typename T> struct FPBits {
    bits.set_mantissa(v);
    return T(bits);
  }
+
+  // The function convert integer number and unbiased exponent to proper float
+  // T type:
+  //   Result = number * 2^(ep+1 - exponent_bias)
+  // Be careful!
+  //   1) "ep" is raw exponent value.
+  //   2) The function add to +1 to ep for seamless normalized to denormalized
+  //      transition.
+  //   3) The function did not check exponent high limit.
+  //   4) "number" zero value is not processed correctly.
+  //   5) Number is unsigned, so the result can be only positive.
+  inline static constexpr FPBits<T> make_value(UIntType number, int ep) {
+    FPBits<T> result;
+    // offset: +1 for sign, but -1 for implicit first bit
+    int lz = fputil::unsafe_clz(number) - FloatProp::EXPONENT_WIDTH;
+    number <<= lz;
+    ep -= lz;
+
+    if (likely(ep >= 0)) {
+      // Implicit number bit will be removed by mask
+      result.set_mantissa(number);
+      result.set_unbiased_exponent(ep + 1);
+    } else {
+      result.set_mantissa(number >> -ep);
+    }
+    return result;
+  }
 };

 } // namespace fputil
--- a/libc/src/__support/FPUtil/Hypot.h
+++ b/libc/src/__support/FPUtil/Hypot.h
@ -26,7 +26,7 @@ template <typename T>
 static inline T find_leading_one(T mant, int &shift_length) {
  shift_length = 0;
  if (mant > 0) {
-    shift_length = (sizeof(mant) * 8) - 1 - clz(mant);
+    shift_length = (sizeof(mant) * 8) - 1 - unsafe_clz(mant);
  }
  return T(1) << shift_length;
 }
--- a/libc/src/__support/FPUtil/builtin_wrappers.h
+++ b/libc/src/__support/FPUtil/builtin_wrappers.h
@ -17,6 +17,15 @@ namespace fputil {
 // __builtin_clz/ctz* rather than using the exactly-sized aliases from stdint.h.
 // This way, we can avoid making any assumptions about integer sizes and let the
 // compiler match for us.
+namespace __internal {
+
+template <typename T> static inline int correct_zero(T val, int bits) {
+  if (val == T(0))
+    return sizeof(T(0)) * 8;
+  else
+    return bits;
+}
+
 template <typename T> static inline int clz(T val);
 template <> inline int clz<unsigned int>(unsigned int val) {
  return __builtin_clz(val);
@ -38,6 +47,23 @@ template <> inline int ctz<unsigned long int>(unsigned long int val) {
 template <> inline int ctz<unsigned long long int>(unsigned long long int val) {
  return __builtin_ctzll(val);
 }
+} // namespace __internal
+
+template <typename T> static inline int safe_ctz(T val) {
+  return __internal::correct_zero(val, __internal::ctz(val));
+}
+
+template <typename T> static inline int unsafe_ctz(T val) {
+  return __internal::ctz(val);
+}
+
+template <typename T> static inline int safe_clz(T val) {
+  return __internal::correct_zero(val, __internal::clz(val));
+}
+
+template <typename T> static inline int unsafe_clz(T val) {
+  return __internal::clz(val);
+}

 template <typename T> static inline bool isnan(T val) {
  return __builtin_isnan(val);
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@ -14,3 +14,9 @@ add_header_library(
  DEPENDS
    libc.src.__support.CPP.uint128
 )
+
+add_header_library(
+  fmod
+  HDRS
+    FMod.h
+)
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@ -207,8 +207,9 @@ template <> inline double fma<double>(double x, double y, double z) {
  // Normalize the result.
  if (prod_mant != 0) {
    uint64_t prod_hi = static_cast<uint64_t>(prod_mant >> 64);
-    int lead_zeros =
-        prod_hi ? clz(prod_hi) : 64 + clz(static_cast<uint64_t>(prod_mant));
+    int lead_zeros = prod_hi
+                         ? unsafe_clz(prod_hi)
+                         : 64 + unsafe_clz(static_cast<uint64_t>(prod_mant));
    // Move the leading 1 to the most significant bit.
    prod_mant <<= lead_zeros;
    // The lower 64 bits are always sticky bits after moving the leading 1 to
--- a/libc/src/__support/FPUtil/generic/FMod.h
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@ -0,0 +1,312 @@
+//===-- Common header for fmod implementations ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H
+
+#include "src/__support/CPP/Limits.h"
+#include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/builtin_wrappers.h"
+#include "src/__support/common.h"
+#include "src/math/generic/math_utils.h"
+
+namespace __llvm_libc {
+namespace fputil {
+namespace generic {
+
+//  Objective:
+//    The  algorithm uses  integer arithmetic  (max uint64_t)  for general case.
+//    Some common  cases, like  abs(x) < abs(y)  or  abs(x) < 1000 *  abs(y) are
+//    treated specially to increase  performance.  The part of checking  special
+//    cases, numbers NaN, INF etc. treated separately.
+//
+//  Objective:
+//    1) FMod definition (https://cplusplus.com/reference/cmath/fmod/):
+//       fmod = numer - tquot * denom, where tquot is the truncated
+//       (i.e., rounded towards zero) result of: numer/denom.
+//    2) FMod with negative x and/or y can be trivially converted to fmod for
+//       positive x and y. Therefore the algorithm below works only with
+//       positive numbers.
+//    3) All positive floating point numbers can be represented as m * 2^e,
+//       where "m" is positive integer and "e" is signed.
+//    4) FMod function can be calculated in integer numbers (x > y):
+//         fmod = m_x * 2^e_x - tquot * m_y * 2^e_y
+//              = 2^e_y * (m_x * 2^(e_x - e^y) - tquot * m_y).
+//       All variables in parentheses are unsigned integers.
+//
+//  Mathematical background:
+//    Input x,y in the algorithm is represented (mathematically) like m_x*2^e_x
+//    and m_y*2^e_y. This is an ambiguous number representation. For example:
+//      m * 2^e = (2 * m) * 2^(e-1)
+//    The algorithm uses the facts that
+//      r = a % b = (a % (N * b)) % b,
+//      (a * c) % (b * c) = (a % b) * c
+//    where N is positive  integer number. a, b and c - positive. Let's  adopt
+//    the formula for representation above.
+//      a = m_x * 2^e_x, b = m_y * 2^e_y, N = 2^k
+//      r(k) = a % b = (m_x * 2^e_x) % (2^k * m_y * 2^e_y)
+//           = 2^(e_y + k) * (m_x * 2^(e_x - e_y - k) % m_y)
+//      r(k) = m_r * 2^e_r = (m_x % m_y) * 2^(m_y + k)
+//           = (2^p * (m_x % m_y) * 2^(e_y + k - p))
+//        m_r = 2^p * (m_x % m_y), e_r = m_y + k - p
+//
+//  Algorithm description:
+//  First, let write x = m_x * 2^e_x and y = m_y * 2^e_y with m_x, m_y, e_x, e_y
+//  are integers (m_x amd m_y positive).
+//  Then the naive  implementation of the fmod function with a simple
+//  for/while loop:
+//      while (e_x > e_y) {
+//        m_x *= 2; --e_x; //  m_x * 2^e_x == 2 * m_x * 2^(e_x - 1)
+//        m_x %= m_y;
+//      }
+//  On the other hand, the algorithm exploits the fact that m_x, m_y are the
+//  mantissas of floating point numbers, which use less bits than the storage
+//  integers: 24 / 32 for floats and 53 / 64 for doubles, so if in each step of
+//  the iteration, we can left shift m_x as many bits as the storage integer
+//  type can hold, the exponent reduction per step will be at least 32 - 24 = 8
+//  for floats and 64 - 53 = 11 for doubles (double example below):
+//      while (e_x > e_y) {
+//        m_x <<= 11; e_x -= 11; //  m_x * 2^e_x == 2^11 * m_x * 2^(e_x - 11)
+//        m_x %= m_y;
+//      }
+//  Some extra improvements are done:
+//    1) Shift m_y maximum to the right, which can significantly improve
+//       performance for small integer numbers (y = 3 for example).
+//       The m_x shift in the loop can be 62 instead of 11 for double.
+//    2) For some architectures with very slow division, it can be better to
+//       calculate inverse value ones, and after do multiplication in the loop.
+//    3) "likely" special cases are treated specially to improve performance.
+//
+//  Simple example:
+//  The examples below use byte for simplicity.
+//    1) Shift hy maximum to right without losing bits and increase iy value
+//       m_y = 0b00101100 e_y = 20 after shift m_y = 0b00001011 e_y = 22.
+//    2) m_x = m_x % m_y.
+//    3) Move m_x maximum to left. Note that after (m_x = m_x % m_y) CLZ in m_x
+//    is not lower than CLZ in m_y. m_x=0b00001001 e_x = 100, m_x=0b10010000,
+//       e_x = 100-4 = 96.
+//    4) Repeat (2) until e_x == e_y.
+//
+//  Complexity analysis (double):
+//    Converting x,y to (m_x,e_x),(m_y, e_y): CTZ/shift/AND/OR/if. Loop  count:
+//      (m_x - m_y) / (64 -  "length of m_y").
+//      max("length of m_y")  = 53,
+//      max(e_x - e_y)  = 2048
+//    Maximum operation is  186. For rare "unrealistic" cases.
+//
+//  Special cases (double):
+//    Supposing  that  case  where |y| > 1e-292 and |x/y|<2000  is  very  common
+//    special processing is implemented. No m_y alignment, no loop:
+//      result = (m_x * 2^(e_x - e_y)) % m_y.
+//    When x and y are both subnormal (rare case but...) the
+//      result = m_x % m_y.
+//    Simplified conversion back to double.
+
+// Exceptional cases handler according to cppreference.com
+//    https://en.cppreference.com/w/cpp/numeric/math/fmod
+// and POSIX standard described in Linux man
+//   https://man7.org/linux/man-pages/man3/fmod.3p.html
+// C standard for the function is not full, so not by default (although it can
+// be implemented in another handler.
+template <typename T> struct FModExceptionalInputHandler {
+
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FModCStandardWrapper instantiated with invalid type.");
+
+  static bool PreCheck(T x, T y, T &out) {
+    if (likely(y != 0 && fputil::isfinite(y) && fputil::isfinite(x))) {
+      return false;
+    }
+
+    if (fputil::isnan(x) || fputil::isnan(y)) {
+      out = fputil::quiet_NaN(T(0));
+      return true;
+    }
+
+    if (fputil::isinf(x) || y == 0) {
+      fputil::set_except(FE_INVALID);
+      out = with_errno(fputil::quiet_NaN(T(0)), EDOM);
+      return true;
+    }
+
+    if (fputil::isinf(y)) {
+      out = x;
+      return true;
+    }
+
+    // case where x == 0
+    out = x;
+    return true;
+  }
+};
+
+template <typename T> struct FModFastMathWrapper {
+
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FModFastMathWrapper instantiated with invalid type.");
+
+  static bool PreCheck(T, T, T &) { return false; }
+};
+
+template <typename T> class FModDivisionSimpleHelper {
+private:
+  using intU_t = typename FPBits<T>::UIntType;
+
+public:
+  inline constexpr static intU_t execute(int exp_diff, int sides_zeroes_count,
+                                         intU_t m_x, intU_t m_y) {
+    while (exp_diff > sides_zeroes_count) {
+      exp_diff -= sides_zeroes_count;
+      m_x <<= sides_zeroes_count;
+      m_x %= m_y;
+    }
+    m_x <<= exp_diff;
+    m_x %= m_y;
+    return m_x;
+  }
+};
+
+template <typename T> class FModDivisionInvMultHelper {
+private:
+  using FPB = FPBits<T>;
+  using intU_t = typename FPB::UIntType;
+
+public:
+  inline constexpr static intU_t execute(int exp_diff, int sides_zeroes_count,
+                                         intU_t m_x, intU_t m_y) {
+    if (exp_diff > sides_zeroes_count) {
+      intU_t inv_hy = (cpp::NumericLimits<intU_t>::max() / m_y);
+      while (exp_diff > sides_zeroes_count) {
+        exp_diff -= sides_zeroes_count;
+        intU_t hd =
+            (m_x * inv_hy) >> (FPB::FloatProp::BIT_WIDTH - sides_zeroes_count);
+        m_x <<= sides_zeroes_count;
+        m_x -= hd * m_y;
+        while (unlikely(m_x > m_y))
+          m_x -= m_y;
+      }
+      intU_t hd = (m_x * inv_hy) >> (FPB::FloatProp::BIT_WIDTH - exp_diff);
+      m_x <<= exp_diff;
+      m_x -= hd * m_y;
+      while (unlikely(m_x > m_y))
+        m_x -= m_y;
+    } else {
+      m_x <<= exp_diff;
+      m_x %= m_y;
+    }
+    return m_x;
+  }
+};
+
+template <typename T, class Wrapper = FModExceptionalInputHandler<T>,
+          class DivisionHelper = FModDivisionSimpleHelper<T>>
+class FMod {
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FMod instantiated with invalid type.");
+
+private:
+  using FPB = FPBits<T>;
+  using intU_t = typename FPB::UIntType;
+
+  inline static constexpr FPB eval_internal(FPB sx, FPB sy) {
+
+    if (likely(sx.uintval() <= sy.uintval())) {
+      if (sx.uintval() < sy.uintval())
+        return sx;        // |x|<|y| return x
+      return FPB::zero(); // |x|=|y| return 0.0
+    }
+
+    int e_x = sx.get_unbiased_exponent();
+    int e_y = sy.get_unbiased_exponent();
+
+    // Most common case where |y| is "very normal" and |x/y| < 2^EXPONENT_WIDTH
+    if (likely(e_y > int(FPB::FloatProp::MANTISSA_WIDTH) &&
+               e_x - e_y <= int(FPB::FloatProp::EXPONENT_WIDTH))) {
+      intU_t m_x = sx.get_explicit_mantissa();
+      intU_t m_y = sy.get_explicit_mantissa();
+      intU_t d = (e_x == e_y) ? (m_x - m_y) : (m_x << (e_x - e_y)) % m_y;
+      if (d == 0)
+        return FPB::zero();
+      // iy - 1 because of "zero power" for number with power 1
+      return FPB::make_value(d, e_y - 1);
+    }
+    /* Both subnormal special case. */
+    if (unlikely(e_x == 0 && e_y == 0)) {
+      FPB d;
+      d.set_mantissa(sx.uintval() % sy.uintval());
+      return d;
+    }
+
+    // Note that hx is not subnormal by conditions above.
+    intU_t m_x = sx.get_explicit_mantissa();
+    e_x--;
+
+    intU_t m_y = sy.get_explicit_mantissa();
+    int lead_zeros_m_y = FPB::FloatProp::EXPONENT_WIDTH;
+    if (likely(e_y > 0)) {
+      e_y--;
+    } else {
+      m_y = sy.get_mantissa();
+      lead_zeros_m_y = unsafe_clz(m_y);
+    }
+
+    // Assume hy != 0
+    int tail_zeros_m_y = unsafe_ctz(m_y);
+    int sides_zeroes_count = lead_zeros_m_y + tail_zeros_m_y;
+    // n > 0 by conditions above
+    int exp_diff = e_x - e_y;
+    {
+      // Shift hy right until the end or n = 0
+      int right_shift = exp_diff < tail_zeros_m_y ? exp_diff : tail_zeros_m_y;
+      m_y >>= right_shift;
+      exp_diff -= right_shift;
+      e_y += right_shift;
+    }
+
+    {
+      // Shift hx left until the end or n = 0
+      int left_shift = exp_diff < int(FPB::FloatProp::EXPONENT_WIDTH)
+                           ? exp_diff
+                           : FPB::FloatProp::EXPONENT_WIDTH;
+      m_x <<= left_shift;
+      exp_diff -= left_shift;
+    }
+
+    m_x %= m_y;
+    if (unlikely(m_x == 0))
+      return FPB::zero();
+
+    if (exp_diff == 0)
+      return FPB::make_value(m_x, e_y);
+
+    /* hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0 */
+    m_x = DivisionHelper::execute(exp_diff, sides_zeroes_count, m_x, m_y);
+    return FPB::make_value(m_x, e_y);
+  }
+
+public:
+  static inline T eval(T x, T y) {
+    if (T out; Wrapper::PreCheck(x, y, out))
+      return out;
+    FPB sx(x), sy(y);
+    bool sign = sx.get_sign();
+    sx.set_sign(false);
+    sy.set_sign(false);
+    FPB result = eval_internal(sx, sy);
+    result.set_sign(sign);
+    return result.get_val();
+  }
+};
+
+} // namespace generic
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@ -36,8 +36,8 @@ template <> struct SpecialLongDouble<long double> {
 template <typename T>
 static inline void normalize(int &exponent,
                             typename FPBits<T>::UIntType &mantissa) {
-  const int shift =
-      clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
+  const int shift = unsafe_clz(mantissa) -
+                    (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
  exponent -= shift;
  mantissa <<= shift;
 }
--- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
+++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
@ -21,7 +21,7 @@ namespace x86 {

 inline void normalize(int &exponent, UInt128 &mantissa) {
  const int shift =
-      clz(static_cast<uint64_t>(mantissa)) -
+      unsafe_clz(static_cast<uint64_t>(mantissa)) -
      (8 * sizeof(uint64_t) - 1 - MantissaWidth<long double>::VALUE);
  exponent -= shift;
  mantissa <<= shift;
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@ -52,11 +52,11 @@ template <class T> uint32_t inline leading_zeroes(T inputNumber) {
 }

 template <> uint32_t inline leading_zeroes<uint32_t>(uint32_t inputNumber) {
-  return inputNumber == 0 ? 32 : fputil::clz(inputNumber);
+  return fputil::safe_clz(inputNumber);
 }

 template <> uint32_t inline leading_zeroes<uint64_t>(uint64_t inputNumber) {
-  return inputNumber == 0 ? 64 : fputil::clz(inputNumber);
+  return fputil::safe_clz(inputNumber);
 }

 static inline uint64_t low64(const UInt128 &num) {
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@ -103,6 +103,9 @@ add_math_entrypoint_object(fmin)
 add_math_entrypoint_object(fminf)
 add_math_entrypoint_object(fminl)

+add_math_entrypoint_object(fmod)
+add_math_entrypoint_object(fmodf)
+
 add_math_entrypoint_object(frexp)
 add_math_entrypoint_object(frexpf)
 add_math_entrypoint_object(frexpl)
--- a/libc/src/math/fmod.h
+++ b/libc/src/math/fmod.h
@ -0,0 +1,18 @@
+//===-- Implementation header for fmod --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMOD_H
+#define LLVM_LIBC_SRC_MATH_FMOD_H
+
+namespace __llvm_libc {
+
+double fmod(double x, double y);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FMOD_H
--- a/libc/src/math/fmodf.h
+++ b/libc/src/math/fmodf.h
@ -0,0 +1,18 @@
+//===-- Implementation header for fmodf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODF_H
+#define LLVM_LIBC_SRC_MATH_FMODF_H
+
+namespace __llvm_libc {
+
+float fmodf(float x, float y);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FMODF_H
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@ -1090,3 +1090,29 @@ add_object_library(
  COMPILE_OPTIONS
   -O3
 )
+
+add_entrypoint_object(
+  fmod
+  SRCS
+    fmod.cpp
+  HDRS
+    ../fmod.h
+  DEPENDS
+    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodf
+  SRCS
+    fmodf.cpp
+  HDRS
+    ../fmodf.h
+  DEPENDS
+    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
--- a/libc/src/math/generic/fmod.cpp
+++ b/libc/src/math/generic/fmod.cpp
@ -0,0 +1,19 @@
+//===-- Double-precision fmod function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmod.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(double, fmod, (double x, double y)) {
+  return fputil::generic::FMod<double>::eval(x, y);
+}
+
+} // namespace __llvm_libc
--- a/libc/src/math/generic/fmodf.cpp
+++ b/libc/src/math/generic/fmodf.cpp
@ -0,0 +1,19 @@
+//===-- Single-precision fmodf function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodf.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) {
+  return fputil::generic::FMod<float>::eval(x, y);
+}
+
+} // namespace __llvm_libc
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@ -1271,6 +1271,34 @@ add_fp_unittest(
    libc.src.__support.FPUtil.fputil
 )

+add_fp_unittest(
+  fmodf_test
+  SUITE
+    libc_math_unittests
+  SRCS
+    fmodf_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.fmodf
+    libc.src.__support.FPUtil.fputil
+)
+
+add_fp_unittest(
+  fmod_test
+  SUITE
+    libc_math_unittests
+  SRCS
+    fmod_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.fmod
+    libc.src.__support.FPUtil.fputil
+)
+
 add_subdirectory(generic)
 add_subdirectory(exhaustive)
 add_subdirectory(differential_testing)
--- a/libc/test/src/math/FModTest.h
+++ b/libc/test/src/math/FModTest.h
@ -0,0 +1,270 @@
+//===-- Utility class to test fmod special numbers ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "utils/UnitTest/FPMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+#include <limits>
+#include <math.h>
+
+#define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
+  EXPECT_FP_EQ(expected, f(x, y));                                             \
+  EXPECT_MATH_ERRNO((dom_err) ? EDOM : 0);                                     \
+  EXPECT_FP_EXCEPTION(expected_exception);                                     \
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT)
+
+#define TEST_REGULAR(x, y, expected) TEST_SPECIAL(x, y, expected, false, 0)
+
+template <typename T> class FmodTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FModFunc)(T, T);
+
+  void testSpecialNumbers(FModFunc f) {
+    using nl = std::numeric_limits<T>;
+
+    // fmod (+0, y) == +0 for y != 0.
+    TEST_SPECIAL(0.0, 3.0, 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::max(), 0.0, false, 0);
+
+    // fmod (-0, y) == -0 for y != 0.
+    TEST_SPECIAL(neg_zero, 3.0, neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::max(), neg_zero, false, 0);
+
+    // fmod (+inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (-inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(neg_inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +0) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(3.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, -0) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(3.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, inf, 3.0, false, 0);
+    // fmod (x, -inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, neg_inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, neg_inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), neg_inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), neg_inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), neg_inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, neg_inf, 3.0, false, 0);
+
+    TEST_SPECIAL(0.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(0.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_zero, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_zero, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(1.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(1.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_inf, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+
+    TEST_SPECIAL(6.5, 2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, 2.25L, -2.0L, false, 0);
+    TEST_SPECIAL(6.5, -2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, -2.25L, -2.0L, false, 0);
+
+    TEST_SPECIAL(nl::max(), nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::min(), nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), -nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::denorm_min(), nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::denorm_min(), neg_zero, false, 0);
+  }
+
+  void testRegularExtreme(FModFunc f) {
+
+    TEST_REGULAR(0x1p127L, 0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, -0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, 0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, -0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, 0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(0x1p127L, -0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(-0x1p127L, 0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, -0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, 0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, -0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, 0x3p-126L, -0x1p-125L);
+    TEST_REGULAR(-0x1p127L, -0x3p-126L, -0x1p-125L);
+
+    if constexpr (sizeof(T) >= sizeof(double)) {
+      TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L);
+    }
+  }
+};
+
+#define LIST_FMOD_TESTS(T, func)                                               \
+  using LlvmLibcFmodTest = FmodTest<T>;                                        \
+  TEST_F(LlvmLibcFmodTest, SpecialNumbers) { testSpecialNumbers(&func); }      \
+  TEST_F(LlvmLibcFmodTest, RegularExtreme) { testRegularExtreme(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
--- a/libc/test/src/math/differential_testing/CMakeLists.txt
+++ b/libc/test/src/math/differential_testing/CMakeLists.txt
@ -470,3 +470,43 @@ add_diff_binary(
  COMPILE_OPTIONS
    -fno-builtin
 )
+
+add_diff_binary(
+  fmodf_diff
+  SRCS
+    fmodf_diff.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodf
+)
+
+add_diff_binary(
+  fmodf_perf
+  SRCS
+    fmodf_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodf
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
+add_diff_binary(
+  fmod_diff
+  SRCS
+    fmod_diff.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmod
+)
+
+add_diff_binary(
+  fmod_perf
+  SRCS
+    fmod_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmod
+  COMPILE_OPTIONS
+    -fno-builtin
+)
--- a/libc/test/src/math/differential_testing/fmod_diff.cpp
+++ b/libc/test/src/math/differential_testing/fmod_diff.cpp
@ -0,0 +1,15 @@
+//===-- Differential test for fmod ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmod.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_DIFF(double, __llvm_libc::fmod, ::fmod, "fmod_diff.log")
--- a/libc/test/src/math/differential_testing/fmod_perf.cpp
+++ b/libc/test/src/math/differential_testing/fmod_perf.cpp
@ -0,0 +1,15 @@
+//===-- Differential test for fmod ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmod.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(double, __llvm_libc::fmod, ::fmod, "fmod_perf.log")
--- a/libc/test/src/math/differential_testing/fmodf_diff.cpp
+++ b/libc/test/src/math/differential_testing/fmodf_diff.cpp
@ -0,0 +1,16 @@
+//===-- Differential test for fmodf ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_DIFF(float, __llvm_libc::fmodf, ::fmodf,
+                             "fmodf_diff.log")
--- a/libc/test/src/math/differential_testing/fmodf_perf.cpp
+++ b/libc/test/src/math/differential_testing/fmodf_perf.cpp
@ -0,0 +1,16 @@
+//===-- Differential test for fmodf ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(float, __llvm_libc::fmodf, ::fmodf,
+                             "fmodf_perf.log")
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@ -184,3 +184,16 @@ add_fp_unittest(
  LINK_LIBRARIES
    -lpthread
 )
+
+add_fp_unittest(
+  fmod_generic_impl_test
+  NO_RUN_POSTBUILD
+  NEED_MPFR
+  SUITE
+    libc_math_exhaustive_tests
+  SRCS
+    fmod_generic_impl_test.cpp
+  DEPENDS
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.generic.fmod
+)
--- a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@ -0,0 +1,78 @@
+//===-- Utility class to test FMod generic implementation -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/FPMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+#include <array>
+#include <limits>
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+template <typename T, bool InverseMultiplication>
+class LlvmLibcFModTest : public __llvm_libc::testing::Test {
+
+  using DivisionHelper = __llvm_libc::cpp::ConditionalType<
+      InverseMultiplication,
+      __llvm_libc::fputil::generic::FModDivisionInvMultHelper<T>,
+      __llvm_libc::fputil::generic::FModDivisionSimpleHelper<T>>;
+
+  static constexpr std::array<T, 11> test_bases = {
+      T(0.0),
+      T(1.0),
+      T(3.0),
+      T(27.0),
+      T(11.0 / 8.0),
+      T(2.764443),
+      T(1.0) - std::numeric_limits<T>::epsilon(),
+      T(1.0) + std::numeric_limits<T>::epsilon(),
+      T(M_PI),
+      T(M_SQRT2),
+      T(M_E)};
+
+public:
+  void testExtensive() {
+    using FMod = __llvm_libc::fputil::generic::FMod<
+        T, __llvm_libc::fputil::generic::FModFastMathWrapper<T>,
+        DivisionHelper>;
+    using nl = std::numeric_limits<T>;
+    int min2 = nl::min_exponent - nl::digits - 5;
+    int max2 = nl::max_exponent + 3;
+    for (T by : test_bases) {
+      for (int iy = min2; iy < max2; iy++) {
+        T y = by * std::ldexp(2, iy);
+        if (y == 0 || !std::isfinite(y))
+          continue;
+        for (T bx : test_bases) {
+          for (int ix = min2; ix < max2; ix++) {
+            T x = bx * std::ldexp(2, ix);
+            if (!std::isfinite(x))
+              continue;
+            T result = FMod::eval(x, y);
+            mpfr::BinaryInput<T> input{x, y};
+            EXPECT_MPFR_MATCH(mpfr::Operation::Fmod, input, result, 0.0);
+          }
+        }
+      }
+    }
+  }
+};
+
+using LlvmLibcFModFloatTest = LlvmLibcFModTest<float, false>;
+TEST_F(LlvmLibcFModFloatTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModFloatInvTest = LlvmLibcFModTest<float, true>;
+TEST_F(LlvmLibcFModFloatInvTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModDoubleTest = LlvmLibcFModTest<double, false>;
+TEST_F(LlvmLibcFModDoubleTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModDoubleInvTest = LlvmLibcFModTest<double, true>;
+TEST_F(LlvmLibcFModDoubleInvTest, ExtensiveTest) { testExtensive(); }
--- a/libc/test/src/math/fmod_test.cpp
+++ b/libc/test/src/math/fmod_test.cpp
@ -0,0 +1,13 @@
+//===-- Unittests for fmod ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmod.h"
+
+LIST_FMOD_TESTS(double, __llvm_libc::fmod)
--- a/libc/test/src/math/fmodf_test.cpp
+++ b/libc/test/src/math/fmodf_test.cpp
@ -0,0 +1,13 @@
+//===-- Unittests for fmodf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodf.h"
+
+LIST_FMOD_TESTS(float, __llvm_libc::fmodf)
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@ -247,6 +247,12 @@ public:
    return result;
  }

+  MPFRNumber fmod(const MPFRNumber &b) {
+    MPFRNumber result(*this);
+    mpfr_fmod(result.value, value, b.value, mpfr_rounding);
+    return result;
+  }
+
  MPFRNumber frexp(int &exp) {
    MPFRNumber result(*this);
    mpfr_exp_t resultExp;
@ -561,6 +567,8 @@ binary_operation_one_output(Operation op, InputType x, InputType y,
  MPFRNumber inputX(x, precision, rounding);
  MPFRNumber inputY(y, precision, rounding);
  switch (op) {
+  case Operation::Fmod:
+    return inputX.fmod(inputY);
  case Operation::Hypot:
    return inputX.hypot(inputY);
  default:
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@ -56,6 +56,7 @@ enum class Operation : int {
  // input and produce a single floating point number of the same type as
  // output.
  BeginBinaryOperationsSingleOutput,
+  Fmod,
  Hypot,
  EndBinaryOperationsSingleOutput,