[libc++][format] Implement Unicode support.

This adds the width estimation functions to the std-format-spec. Implements parts of: - P0645 Text Formatting - P1868 width: clarifying units of width and precision in std::format Reviewed By: #libc, ldionne, vitaut Differential Revision: https://reviews.llvm.org/D103413
2021-05-25 20:32:38 +02:00 · 2021-05-25 20:32:38 +02:00 · ac7031b2b2
parent f33274c7bf
commit ac7031b2b2
5 changed files with 1041 additions and 1 deletions
--- a/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
+++ b/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
@ -0,0 +1,196 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_HAS_NO_UNICODE
+
+#include <array>
+#include <format>
+
+#include "benchmark/benchmark.h"
+
+#include "test_macros.h"
+
+/// Benchmark fixture: stores the given string literal repeated 100 times
+/// (without its terminating NUL) and runs the width estimation on it.
+///
+/// @tparam CharT The character type of the literal.
+/// @tparam N     The size of the literal including the terminating NUL.
+template <class CharT, size_t N>
+class tester {
+  /// Number of code units in the literal, excluding the terminating NUL.
+  static constexpr size_t size_ = N - 1;
+  /// Number of times the literal is repeated in the benchmark input.
+  static constexpr size_t repeat_ = 100;
+  std::array<CharT, repeat_ * size_> data_{};
+
+public:
+  explicit constexpr tester(const CharT (&input)[N]) {
+    auto it = data_.begin();
+    for (size_t i = 0; i < repeat_; ++i)
+      it = std::copy_n(input, size_, it);
+  }
+
+  constexpr size_t size() const noexcept { return data_.size(); }
+  // Use data() rather than begin()/end(): the standard doesn't require
+  // std::array's iterator to be a raw pointer.
+  constexpr const CharT* begin() const noexcept { return data_.data(); }
+  constexpr const CharT* end() const noexcept {
+    return data_.data() + data_.size();
+  }
+
+  /// Runs the benchmark loop. Width and precision are both 1'000'000 so the
+  /// requested limits are far larger than the stored input.
+  void test(benchmark::State& state) const {
+    for (auto _ : state)
+      benchmark::DoNotOptimize(std::__format_spec::__get_string_alignment(
+          begin(), end(), 1'000'000, 1'000'000));
+    state.SetItemsProcessed(state.iterations() * size());
+  }
+};
+
+// Builds a tester for the string literal in the encoding matching CharT and
+// runs the benchmark on it. The argument must be an unprefixed string literal:
+// for char16_t and char32_t the u respectively U encoding prefix is added via
+// TEST_CONCAT (from test_macros.h, presumably token pasting).
+// Expects the names CharT and state to be in scope where the macro is used.
+#define TEST(u8)                                                               \
+  if constexpr (std::same_as<CharT, char>) {                                   \
+    constexpr auto p = tester{u8};                                             \
+    p.test(state);                                                             \
+  } else if constexpr (std::same_as<CharT, char16_t>) {                        \
+    constexpr auto p = tester{TEST_CONCAT(u, u8)};                             \
+    p.test(state);                                                             \
+  } else {                                                                     \
+    constexpr auto p = tester{TEST_CONCAT(U, u8)};                             \
+    p.test(state);                                                             \
+  }
+
+// Benchmarks input containing only ASCII characters.
+template <class CharT>
+static void BM_EstimateLengthNoMultiByte(benchmark::State& state) {
+  TEST("The quick brown fox jumps over the lazy dog");
+}
+
+// Benchmarks a German pangram: mostly ASCII with some non-ASCII letters.
+template <class CharT>
+static void BM_EstimateLengthTwoByteDE(benchmark::State& state) {
+  // Verifies the size of the narrow literal (including the terminating NUL);
+  // presumably a guard that the source is read as UTF-8.
+  static_assert(sizeof("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == 67);
+
+  // https://en.wikipedia.org/wiki/Pangram
+  TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich");
+}
+
+// Benchmarks a Polish pangram: a larger share of non-ASCII letters than the
+// German pangram.
+template <class CharT>
+static void BM_EstimateLengthTwoBytePL(benchmark::State& state) {
+  // Verifies the size of the narrow literal (including the terminating NUL);
+  // presumably a guard that the source is read as UTF-8.
+  static_assert(sizeof("Stróż pchnął kość w quiz gędźb vel fax myjń") == 53);
+
+  // https://en.wikipedia.org/wiki/Pangram
+  TEST("Stróż pchnął kość w quiz gędźb vel fax myjń");
+}
+
+// All code points are below 0x1100, which is the first multi-column code
+// point. In UTF-8 every code point takes three code units: 16 * 3 + NUL == 49.
+template <class CharT>
+static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) {
+  static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
+                       "\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
+                49);
+
+  TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
+       "\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
+}
+
+// All code points are at or above 0x1100 but outside the two-column ranges,
+// so each is estimated as one column.
+template <class CharT>
+static void
+BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) {
+  static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
+                       "\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") ==
+                49);
+
+  TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
+       "\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f");
+}
+
+// Mostly one-column code points; U+1100 and U+1108 fall in the two-column
+// range [0x1100, 0x115f].
+template <class CharT>
+static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) {
+  static_assert(sizeof("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
+                       "\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
+                49);
+
+  TEST("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
+       "\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
+}
+
+// A mix of code points from the Basic Multilingual Plane, all of which take
+// three code units in UTF-8.
+template <class CharT>
+static void BM_EstimateLengthThreeByte(benchmark::State& state) {
+  static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
+                       "\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") ==
+                49);
+
+  TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
+       "\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e");
+}
+
+// Code points at or above 0x1'0000 (four code units in UTF-8) that are below
+// the first two-column range of that plane, so each is one column.
+template <class CharT>
+static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) {
+  static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
+                       "\U00010004\U00010005\U00010006\U00010007"
+                       "\U00010008\U00010009\U0001000a\U0001000b"
+                       "\U0001000c\U0001000d\U0001000e\U0001000f") == 65);
+
+  TEST("\U00010000\U00010001\U00010002\U00010003"
+       "\U00010004\U00010005\U00010006\U00010007"
+       "\U00010008\U00010009\U0001000a\U0001000b"
+       "\U0001000c\U0001000d\U0001000e\U0001000f");
+}
+
+// Code points in the range [0x2'0000, 0x2'fffd]: four code units in UTF-8 and
+// an estimated width of two columns each.
+template <class CharT>
+static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) {
+  static_assert(sizeof("\U00020000\U00020001\U00020002\U00020003"
+                       "\U00020004\U00020005\U00020006\U00020007"
+                       "\U00020008\U00020009\U0002000a\U0002000b"
+                       "\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
+
+  TEST("\U00020000\U00020001\U00020002\U00020003"
+       "\U00020004\U00020005\U00020006\U00020007"
+       "\U00020008\U00020009\U0002000a\U0002000b"
+       "\U0002000c\U0002000d\U0002000e\U0002000f");
+}
+
+// Alternates groups of one-column (0x1'00xx) and two-column (0x2'00xx) code
+// points, all taking four code units in UTF-8.
+template <class CharT>
+static void BM_EstimateLengthFourByte(benchmark::State& state) {
+  static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
+                       "\U00020004\U00020005\U00020006\U00020007"
+                       "\U00010008\U00010009\U0001000a\U0001000b"
+                       "\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
+
+  TEST("\U00010000\U00010001\U00010002\U00010003"
+       "\U00020004\U00020005\U00020006\U00020007"
+       "\U00010008\U00010009\U0001000a\U0001000b"
+       "\U0002000c\U0002000d\U0002000e\U0002000f");
+}
+
+// Register every benchmark for each of the three character types handled by
+// the TEST macro.
+
+// char
+BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char);
+
+// char16_t
+BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char16_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char16_t);
+
+// char32_t
+BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char32_t);
+BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char32_t);
+
+// Entry point: lets the benchmark library parse its command line options,
+// fails with exit code 1 on unrecognized arguments, else runs the benchmarks.
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  const bool has_unrecognized_arguments =
+      benchmark::ReportUnrecognizedArguments(argc, argv);
+  if (has_unrecognized_arguments) {
+    return 1;
+  }
+
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
+#else
+// Without Unicode support there's nothing to benchmark.
+int main(int, char**) { return 0; }
+#endif
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@ -171,7 +171,7 @@
 "`P1460 <https://wg21.link/P1460>`__","LWG","Mandating the Standard Library: Clause 20 - Utilities library","Prague","* *",""
 "`P1739 <https://wg21.link/P1739>`__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","* *",""
 "`P1831 <https://wg21.link/P1831>`__","LWG","Deprecating volatile: library","Prague","* *",""
-"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","* *",""
+"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","|In Progress|",""
 "`P1908 <https://wg21.link/P1908>`__","CWG","Reserving Attribute Namespaces for Future Use","Prague","* *",""
 "`P1937 <https://wg21.link/P1937>`__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *",""
 "`P1956 <https://wg21.link/P1956>`__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0"
--- a/libcxx/include/__format/parser_std_format_spec.h
+++ b/libcxx/include/__format/parser_std_format_spec.h
@ -10,12 +10,15 @@
 #ifndef _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
 #define _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H

+#include <__algorithm/find_if.h>
+#include <__algorithm/min.h>
 #include <__config>
 #include <__debug>
 #include <__format/format_arg.h>
 #include <__format/format_error.h>
 #include <__format/format_string.h>
 #include <__variant/monostate.h>
+#include <bit>
 #include <concepts>
 #include <cstdint>
 #include <type_traits>
@ -24,6 +27,9 @@
 # pragma GCC system_header
 #endif

+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD

 #if _LIBCPP_STD_VER > 17
@ -711,6 +717,462 @@ protected:
 // TODO FMT Add a parser for floating-point values.
 // TODO FMT Add a parser for pointer values.

+/** Helper struct returned from @ref __get_string_alignment. */
+template <class _CharT>
+struct _LIBCPP_TEMPLATE_VIS __string_alignment {
+  /** Points beyond the last character to write to the output. */
+  const _CharT* __last;
+  /**
+   * The estimated number of columns in the output or 0.
+   *
+   * The exact number of columns is only required when the output needs to be
+   * aligned. When the formatted output only has a minimum width, it's
+   * sufficient to know the minimum has been reached. The minimum width is the
+   * width specified in the format-spec.
+   *
+   * For example in this code @code std::format("{:10}", MyString); @endcode
+   * the width estimation can stop once the algorithm has determined the output
+   * width is 10 columns.
+   *
+   * So if:
+   * * @ref __align == @c true the @ref __size is the estimated number of
+   *   columns required.
+   * * @ref __align == @c false the @ref __size is the estimated number of
+   *   columns required or 0 when the estimation algorithm stopped prematurely.
+   */
+  ptrdiff_t __size;
+  /**
+   * Does the output need to be aligned.
+   *
+   * When alignment is needed the output algorithm needs to add the proper
+   * padding. Else the output algorithm just needs to copy the input up to
+   * @ref __last.
+   */
+  bool __align;
+};
+
+#ifndef _LIBCPP_HAS_NO_UNICODE
+namespace __detail {
+
+/**
+ * Unicode column width estimates.
+ *
+ * Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
+ * Depending on format the relation between the number of code units stored and
+ * the number of output columns differs. The first relation is the number of
+ * code units forming a code point. (The text assumes the code units are
+ * unsigned.)
+ * - UTF-8: The number of code units is between one and four. The first 128
+ *   Unicode code points match the ASCII character set. When the highest bit is
+ *   set it means the code point has more than one code unit.
+ * - UTF-16: The number of code units is either one or two. When the first
+ *   code unit is in the range [0xd800,0xdfff] it means the code point uses two
+ *   code units.
+ * - UTF-32: The number of code units is always one.
+ *
+ * The mapping from a code point to its number of columns isn't well defined.
+ * The code uses the estimations defined in [format.string.std]/11. This list
+ * might change in the future.
+ *
+ * The algorithm of @ref __get_string_alignment uses two different scanners:
+ * - The simple scanner @ref __estimate_column_width_fast. This scanner assumes
+ *   1 code unit is 1 column. This scanner stops when it can't be sure the
+ *   assumption is valid:
+ *   - UTF-8 when the code point is encoded in more than 1 code unit.
+ *   - UTF-16 and UTF-32 when the first multi-column code point is encountered.
+ *     (The code unit's value is lower than 0xd800 so the 2 code unit encoding
+ *     is irrelevant for this scanner.)
+ *   Due to these assumptions the scanner is faster than the full scanner. It
+ *   can process all text only containing ASCII. For UTF-16/32 it can process
+ *   most (all?) European languages. (Note the set it can process might be
+ *   reduced in the future, due to updates in the scanning rules.)
+ * - The full scanner @ref __estimate_column_width. This scanner, if needed,
+ *   converts multiple code units into one code point then converts the code
+ *   point to a column width.
+ *
+ * See also:
+ * - [format.string.std]/11
+ * - https://en.wikipedia.org/wiki/UTF-8#Encoding
+ * - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
+ */
+
+/**
+ * The first code point with an estimated width of two columns.
+ *
+ * This is the point where the fast UTF-16/32 scanner needs to stop processing.
+ */
+inline constexpr uint32_t __two_column_code_point = 0x1100;
+
+/** Helper concept for a UTF-8 character type: @c char or @c char8_t. */
+template <class _CharT>
+concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>;
+
+/**
+ * Helper concept for a UTF-16 character type: @c char16_t, or @c wchar_t on
+ * platforms where it is two bytes wide.
+ */
+template <class _CharT>
+concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>;
+
+/**
+ * Helper concept for a UTF-32 character type: @c char32_t, or @c wchar_t on
+ * platforms where it is four bytes wide.
+ */
+template <class _CharT>
+concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>;
+
+/** Helper concept for a UTF-16 or UTF-32 character type. */
+template <class _CharT>
+concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>;
+
+/**
+ * Converts a code point to the column width.
+ *
+ * The estimations are conforming to [format.string.std]/11
+ *
+ * This version expects a value less than 0x1'0000, that is a code point that
+ * is encoded in at most three UTF-8 code units.
+ *
+ * @param __c The code point.
+ * @returns 2 when @a __c is in one of the listed ranges, else 1.
+ */
+_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept {
+  _LIBCPP_ASSERT(__c < 0x1'0000,
+                 "Use __column_width_4 or __column_width for larger values");
+
+  // Every line tests one inclusive range [lower, upper]. The ranges are in
+  // ascending order, so the evaluation stops at the first lower bound that is
+  // larger than __c. The boolean result adds the second column.
+  // clang-format off
+  return 1 + (__c >= 0x1100 && (__c <= 0x115f ||
+             (__c >= 0x2329 && (__c <= 0x232a ||
+             (__c >= 0x2e80 && (__c <= 0x303e ||
+             (__c >= 0x3040 && (__c <= 0xa4cf ||
+             (__c >= 0xac00 && (__c <= 0xd7a3 ||
+             (__c >= 0xf900 && (__c <= 0xfaff ||
+             (__c >= 0xfe10 && (__c <= 0xfe19 ||
+             (__c >= 0xfe30 && (__c <= 0xfe6f ||
+             (__c >= 0xff00 && (__c <= 0xff60 ||
+             (__c >= 0xffe0 && (__c <= 0xffe6
+             ))))))))))))))))))));
+  // clang-format on
+}
+
+/**
+ * @overload
+ *
+ * This version expects a value greater than or equal to 0x1'0000, that is a
+ * code point that is encoded in four UTF-8 code units.
+ *
+ * @param __c The code point.
+ * @returns 2 when @a __c is in one of the listed ranges, else 1.
+ */
+_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept {
+  _LIBCPP_ASSERT(__c >= 0x1'0000,
+                 "Use __column_width_3 or __column_width for smaller values");
+
+  // Every line tests one inclusive range [lower, upper], in ascending order.
+  // The boolean result adds the second column.
+  // clang-format off
+  return 1 + (__c >= 0x1'f300 && (__c <= 0x1'f64f ||
+             (__c >= 0x1'f900 && (__c <= 0x1'f9ff ||
+             (__c >= 0x2'0000 && (__c <= 0x2'fffd ||
+             (__c >= 0x3'0000 && (__c <= 0x3'fffd
+             ))))))));
+  // clang-format on
+}
+
+/**
+ * @overload
+ *
+ * Accepts every value and dispatches on the 0x1'0000 boundary to the
+ * specialized version for that part of the code point range.
+ */
+_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept {
+  return __c < 0x1'0000 ? __column_width_3(__c) : __column_width_4(__c);
+}
+
+/**
+ * Fast column width scanner for UTF-8.
+ *
+ * @returns A pointer to the first code unit with its highest bit set, or
+ *          @a __last when there is no such code unit. Every code unit before
+ *          the returned position is a single code unit code point.
+ */
+template <__utf8_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
+__estimate_column_width_fast(const _CharT* __first,
+                             const _CharT* __last) noexcept {
+  auto __has_high_bit = [](unsigned char __unit) {
+    return (__unit & 0x80) != 0;
+  };
+  return _VSTD::find_if(__first, __last, __has_high_bit);
+}
+
+/**
+ * @overload
+ *
+ * The implementation for UTF-16/32.
+ *
+ * @returns A pointer to the first code unit whose value is at least
+ *          @ref __two_column_code_point, or @a __last when there is no such
+ *          code unit.
+ */
+template <__utf16_or_32_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
+__estimate_column_width_fast(const _CharT* __first,
+                             const _CharT* __last) noexcept {
+  // Use the named constant instead of repeating the literal, so the scanner
+  // and the documented stop point can't get out of sync.
+  return _VSTD::find_if(__first, __last, [](uint32_t __c) {
+    return __c >= __two_column_code_point;
+  });
+}
+
+/** The result of the @ref __estimate_column_width functions. */
+template <class _CharT>
+struct _LIBCPP_TEMPLATE_VIS __column_width_result {
+  /** The number of output columns. */
+  size_t __width;
+  /**
+   * Points beyond the last code unit that was accepted by the estimation.
+   *
+   * This limits the original output to fit in the wanted number of columns.
+   */
+  const _CharT* __ptr;
+};
+
+/**
+ * Small helper to determine the width of malformed Unicode.
+ *
+ * Every remaining code unit is counted as one column, limited to the number
+ * of columns still available.
+ *
+ * @note This function's only needed for UTF-8. During scanning UTF-8 there
+ * are multiple places where it can be detected that the Unicode is malformed.
+ * UTF-16 only requires 1 test and UTF-32 requires no testing.
+ *
+ * @param __first   Points to the first malformed code unit.
+ * @param __last    Points beyond the last element of the input range.
+ * @param __maximum The maximum number of output columns for the entire scan.
+ * @param __result  The number of columns already used before @a __first.
+ *
+ * @returns The total width, which doesn't exceed @a __maximum unless
+ *          @a __result already did, and the position where the scan stopped.
+ */
+template <__utf8_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
+__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last,
+                                  size_t __maximum, size_t __result) noexcept {
+  const size_t __size = __last - __first;
+  // Only the columns not yet used by the valid prefix are available. Clamping
+  // to __maximum itself would let the total exceed the requested maximum.
+  const size_t __available = __maximum > __result ? __maximum - __result : 0;
+  const size_t __n = _VSTD::min(__size, __available);
+  return {__result + __n, __first + __n};
+}
+
+/**
+ * Determines the number of output columns needed to render the input.
+ *
+ * This is the implementation for UTF-8.
+ *
+ * @note When the scanner encounters malformed Unicode it acts as-if every code
+ * unit at the end of the input is one output column. It's expected the output
+ * terminal will replace these malformed code units with a one column
+ * replacement character.
+ *
+ * @param __first   Points to the first element of the input range.
+ * @param __last    Points beyond the last element of the input range.
+ * @param __maximum The maximum number of output columns. The returned number
+ *                  of estimated output columns will not exceed this value.
+ *
+ * @returns The estimated width and a pointer beyond the last accepted code
+ *          unit.
+ */
+template <__utf8_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
+__estimate_column_width(const _CharT* __first, const _CharT* __last,
+                        size_t __maximum) noexcept {
+  size_t __result = 0;
+
+  while (__first != __last) {
+    // Based on the number of leading 1 bits the number of code units in the
+    // code point can be determined. See
+    // https://en.wikipedia.org/wiki/UTF-8#Encoding
+    switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) {
+    case 0: // 1-code unit encoding: all 1 column
+      ++__result;
+      ++__first;
+      break;
+
+    case 2: // 2-code unit encoding: all 1 column
+      // Malformed Unicode.
+      if (__last - __first < 2) [[unlikely]]
+        return __estimate_column_width_malformed(__first, __last, __maximum,
+                                                 __result);
+      __first += 2;
+      ++__result;
+      break;
+
+    case 3: // 3-code unit encoding: either 1 or 2 columns
+      // Malformed Unicode.
+      if (__last - __first < 3) [[unlikely]]
+        return __estimate_column_width_malformed(__first, __last, __maximum,
+                                                 __result);
+      {
+        // Assemble the code point from the payload bits: 4 bits of the first
+        // code unit and 6 bits of every following code unit.
+        uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f;
+        __c <<= 6;
+        __c |= static_cast<unsigned char>(*__first++) & 0x3f;
+        __c <<= 6;
+        __c |= static_cast<unsigned char>(*__first++) & 0x3f;
+        __result += __column_width_3(__c);
+        // At the start of the iteration __result was below __maximum, so the
+        // maximum can only be exceeded by a 2 column code point. Reject that
+        // code point: undo its width and its 3 code units.
+        if (__result > __maximum)
+          return {__result - 2, __first - 3};
+      }
+      break;
+    case 4: // 4-code unit encoding: either 1 or 2 columns
+      // Malformed Unicode.
+      if (__last - __first < 4) [[unlikely]]
+        return __estimate_column_width_malformed(__first, __last, __maximum,
+                                                 __result);
+      {
+        // Assemble the code point from the payload bits: 3 bits of the first
+        // code unit and 6 bits of every following code unit.
+        uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07;
+        __c <<= 6;
+        __c |= static_cast<unsigned char>(*__first++) & 0x3f;
+        __c <<= 6;
+        __c |= static_cast<unsigned char>(*__first++) & 0x3f;
+        __c <<= 6;
+        __c |= static_cast<unsigned char>(*__first++) & 0x3f;
+        __result += __column_width_4(__c);
+        // Reject a 2 column code point that doesn't fit: undo its width and
+        // its 4 code units.
+        if (__result > __maximum)
+          return {__result - 2, __first - 4};
+      }
+      break;
+    default:
+      // Malformed Unicode.
+      // A single leading 1 bit (a continuation code unit at the start of a
+      // code point) or more than four leading 1 bits.
+      return __estimate_column_width_malformed(__first, __last, __maximum,
+                                               __result);
+    }
+
+    // Stop as soon as the maximum is reached, the remainder of the input
+    // isn't examined.
+    if (__result >= __maximum)
+      return {__result, __first};
+  }
+  return {__result, __first};
+}
+
+/**
+ * Determines the number of output columns needed to render the input.
+ *
+ * This is the implementation for UTF-16.
+ *
+ * @param __first   Points to the first element of the input range.
+ * @param __last    Points beyond the last element of the input range.
+ * @param __maximum The maximum number of output columns.
+ *
+ * @returns The estimated width and a pointer beyond the last accepted code
+ *          unit.
+ */
+template <__utf16_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
+__estimate_column_width(const _CharT* __first, const _CharT* __last,
+                        size_t __maximum) noexcept {
+  size_t __result = 0;
+
+  while (__first != __last) {
+    uint32_t __c = *__first;
+    // Is the code unit part of a surrogate pair? See
+    // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
+    if (__c >= 0xd800 && __c <= 0xDfff) {
+      // Malformed Unicode.
+      // The input ends after the first surrogate; count it as one column.
+      if (__last - __first < 2) [[unlikely]]
+        return {__result + 1, __first + 1};
+
+      // Combine the two surrogates to a code point.
+      // NOTE(review): neither the order of the surrogates nor the value of the
+      // second code unit is validated here.
+      __c -= 0xd800;
+      __c <<= 10;
+      __c += (*(__first + 1) - 0xdc00);
+      __c += 0x10'000;
+
+      __result += __column_width_4(__c);
+      // Reject a 2 column code point that doesn't fit. __first hasn't been
+      // advanced yet, so it still points at the rejected code point.
+      if (__result > __maximum)
+        return {__result - 2, __first};
+      __first += 2;
+    } else {
+      __result += __column_width_3(__c);
+      // Reject a 2 column code point that doesn't fit.
+      if (__result > __maximum)
+        return {__result - 2, __first};
+      ++__first;
+    }
+
+    // Stop as soon as the maximum is reached.
+    if (__result >= __maximum)
+      return {__result, __first};
+  }
+
+  return {__result, __first};
+}
+
+/**
+ * Determines the number of output columns needed to render the input.
+ *
+ * This is the implementation for UTF-32, where every code unit is a code
+ * point.
+ *
+ * @param __first   Points to the first element of the input range.
+ * @param __last    Points beyond the last element of the input range.
+ * @param __maximum The maximum number of output columns.
+ *
+ * @returns The estimated width and a pointer beyond the last accepted code
+ *          unit.
+ */
+template <__utf32_character _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
+__estimate_column_width(const _CharT* __first, const _CharT* __last,
+                        size_t __maximum) noexcept {
+  size_t __result = 0;
+
+  while (__first != __last) {
+    // Use a 32-bit type: _CharT can be char32_t on a platform where wchar_t is
+    // only 16 bits wide, storing the code unit in a wchar_t would truncate
+    // code points above 0xffff.
+    uint32_t __c = *__first;
+    __result += __column_width(__c);
+
+    // Reject a 2 column code point that doesn't fit. __first hasn't been
+    // advanced yet, so it still points at the rejected code point.
+    if (__result > __maximum)
+      return {__result - 2, __first};
+
+    ++__first;
+    // Stop as soon as the maximum is reached.
+    if (__result >= __maximum)
+      return {__result, __first};
+  }
+
+  return {__result, __first};
+}
+
+} // namespace __detail
+
+/**
+ * Determines which part of the input to write and whether the output needs
+ * to be aligned.
+ *
+ * @param __first     Points to the first element of the input range.
+ * @param __last      Points beyond the last element of the input range.
+ * @param __width     The minimum number of output columns.
+ * @param __precision The maximum number of output columns or -1 when there's
+ *                    no maximum.
+ *
+ * @pre @a __width != 0 || @a __precision != -1
+ *
+ * @returns The end of the input to write, the estimated number of columns (0
+ *          when the estimation stopped prematurely), and whether padding is
+ *          required.
+ */
+template <class _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
+__get_string_alignment(const _CharT* __first, const _CharT* __last,
+                       ptrdiff_t __width, ptrdiff_t __precision) noexcept {
+  _LIBCPP_ASSERT(__width != 0 || __precision != -1,
+                 "The function has no effect and shouldn't be used");
+
+  // TODO FMT There might be more optimizations possible:
+  // If __precision == __format::__number_max and the encoding is:
+  // * UTF-8  : 4 * (__last - __first) >= __width
+  // * UTF-16 : 2 * (__last - __first) >= __width
+  // * UTF-32 : (__last - __first) >= __width
+  // In these cases it's certain the output is at least the requested width.
+  // It's unknown how often this happens in practice. For now the improvement
+  // isn't implemented.
+
+  /*
+   * First assume there are no special Unicode code units in the input.
+   * - Apply the precision (this may reduce the size of the input). When
+   *   __precision == -1 this step is omitted.
+   * - Scan for special code units in the input.
+   * If our assumption was correct the __pos will be at the end of the input.
+   */
+  const ptrdiff_t __length = __last - __first;
+  const _CharT* __limit =
+      __first +
+      (__precision == -1 ? __length : _VSTD::min(__length, __precision));
+  ptrdiff_t __size = __limit - __first;
+  const _CharT* __pos =
+      __detail::__estimate_column_width_fast(__first, __limit);
+
+  // Every code unit in [__first, __limit) is one column, so the number of
+  // code units is the number of columns.
+  if (__pos == __limit)
+    return {__limit, __size, __size < __width};
+
+  /*
+   * Our assumption was wrong, there are special Unicode code units.
+   * The range [__first, __pos) contains a set of code units with the
+   * following property:
+   *      Every _CharT in the range will be rendered in 1 column.
+   *
+   * If there's no maximum width and the parsed size already exceeds the
+   *   minimum required width. The real size isn't important. So bail out.
+   */
+  if (__precision == -1 && (__pos - __first) >= __width)
+    return {__last, 0, false};
+
+  /* If there's a __precision, truncate the output to that width. */
+  ptrdiff_t __prefix = __pos - __first;
+  if (__precision != -1) {
+    // __pos is before __limit and __limit is at most __precision code units
+    // beyond __first.
+    _LIBCPP_ASSERT(__precision > __prefix, "Logic error.");
+    auto __lengh_info = __detail::__estimate_column_width(
+        __pos, __last, __precision - __prefix);
+    __size = __lengh_info.__width + __prefix;
+    return {__lengh_info.__ptr, __size, __size < __width};
+  }
+
+  /* Else use __width to determine the number of required padding characters. */
+  _LIBCPP_ASSERT(__width > __prefix, "Logic error.");
+  /*
+   * The column width is always one or two columns. For the precision the wanted
+   * column width is the maximum, for the width it's the minimum. Using the
+   * width estimation with its truncating behavior will result in the wrong
+   * result in the following case:
+   * - The last code unit processed requires two columns and exceeds the
+   *   maximum column width.
+   * Increasing the __maximum by one avoids this issue. (It means it may
+   * pass one code point more than required to determine the proper result;
+   * that however isn't a problem for the algorithm.)
+   */
+  size_t __maximum = 1 + __width - __prefix;
+  auto __lengh_info =
+      __detail::__estimate_column_width(__pos, __last, __maximum);
+  if (__lengh_info.__ptr != __last) {
+    // Consumed the width number of code units. The exact size of the string
+    // is unknown. We only know we don't need to align the output.
+    _LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >=
+                       __width,
+                   "Logic error");
+    return {__last, 0, false};
+  }
+
+  // The entire input has been scanned, so the exact size is known.
+  __size = __lengh_info.__width + __prefix;
+  return {__last, __size, __size < __width};
+}
+#else  // _LIBCPP_HAS_NO_UNICODE
+/**
+ * The version used without Unicode support: every code unit is one column.
+ *
+ * The input is truncated to @a __precision code units, unless
+ * @a __precision == -1. Alignment is required when the remaining number of
+ * code units is less than @a __width.
+ */
+template <class _CharT>
+_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
+__get_string_alignment(const _CharT* __first, const _CharT* __last,
+                       ptrdiff_t __width, ptrdiff_t __precision) noexcept {
+  ptrdiff_t __size = __last - __first;
+  if (__precision != -1 && __precision < __size)
+    __size = __precision;
+
+  return {__first + __size, __size, __size < __width};
+}
+#endif // _LIBCPP_HAS_NO_UNICODE
+
 } // namespace __format_spec

 # endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
@ -719,4 +1181,6 @@ protected:

 _LIBCPP_END_NAMESPACE_STD

+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_non_unicode.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_non_unicode.pass.cpp
@ -0,0 +1,110 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-no-concepts
+// UNSUPPORTED: libcpp-has-no-incomplete-format
+
+// UTF-32 doesn't work properly
+// XFAIL: windows
+
+// <format>
+
+// Tests the Unicode width support of the standard format specifiers.
+// It tests [format.string.std]/8 - 11:
+// - Properly determining the estimated width of a Unicode string.
+// - Properly truncating to the wanted maximum width.
+
+// This version runs the test when the platform doesn't have Unicode support.
+// REQUIRES: libcpp-has-no-unicode
+
+#include <format>
+#include <cassert>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+#define CSTR(S) MAKE_CSTRING(CharT, S)
+
+using namespace std::__format_spec;
+
+// Member-wise comparison, used to compare the result with the expectation.
+template <class CharT>
+constexpr bool operator==(const __string_alignment<CharT>& lhs,
+                          const __string_alignment<CharT>& rhs) noexcept {
+  if (lhs.__last != rhs.__last)
+    return false;
+  if (lhs.__size != rhs.__size)
+    return false;
+  return lhs.__align == rhs.__align;
+}
+
+// Calls __get_string_alignment for str and validates the result.
+//
+// offset    The expected offset of __last from the start of str.
+// size      The expected __size.
+// align     The expected __align.
+// str       The NUL-terminated input.
+// width     The width passed to __get_string_alignment.
+// precision The precision passed to __get_string_alignment, -1 for none.
+//
+// width and precision are signed like the parameters of the function under
+// test, so the -1 sentinel is passed on without a round trip through size_t.
+template <class CharT>
+constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
+                                    const CharT* str, ptrdiff_t width,
+                                    ptrdiff_t precision) {
+  std::basic_string_view<CharT> sv{str};
+  // Use data() instead of begin(): the function under test takes pointers and
+  // the iterator of a string_view isn't required to be one.
+  const CharT* first = sv.data();
+  const CharT* last = first + sv.size();
+  __string_alignment<CharT> expected{first + offset, size, align};
+  __string_alignment<CharT> traits =
+      __get_string_alignment(first, last, width, precision);
+  assert(traits == expected);
+}
+
+// The test cases for a platform without Unicode support. There every code
+// unit is one column, so the expected values are counts of code units. The
+// term 3 + 2 * (sizeof(CharT) == 1) is the length of an input with two ASCII
+// characters and one non-ASCII character: 3 code units, or 5 code units for
+// the 1-byte character types.
+template <class CharT>
+constexpr void get_string_alignment() {
+  // Truncate the input.
+  get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
+
+  // The precision keeps the first two code units, regardless of the column
+  // width of the code point (for the 1-byte character types this presumably
+  // cuts the multi-byte sequence).
+  get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2);
+
+  // No alignment since the number of code units kept equals the width.
+  get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2);
+
+  // Same but for a 2-column 4-byte UTF-8 sequence
+  get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2);
+  get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2);
+
+  // No alignment required.
+  get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
+  get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
+
+  get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
+                       3 + 2 * (sizeof(CharT) == 1), false, CSTR("ab\u1111"), 2,
+                       -1);
+
+  // Without a precision the entire input is always measured, so the size is
+  // the number of code units.
+  get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
+                       3 + 2 * (sizeof(CharT) == 1), false,
+                       CSTR("a\u115fc") /* 2-column character */, 3, -1);
+  // Extend width
+  get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
+  get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
+                       3 + 2 * (sizeof(CharT) == 1), true,
+                       CSTR("a\u1160c") /* 1-column character */, 6, -1);
+}
+
+// Runs all tests for one character type.
+template <class CharT>
+constexpr void test() {
+  get_string_alignment<CharT>();
+}
+
+// Runs the tests for all character types available on the platform.
+// Returns true so the function can be used in a static_assert.
+constexpr bool test() {
+  test<char>();
+  test<wchar_t>();
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
+  test<char8_t>();
+#endif
+#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
+  test<char16_t>();
+  test<char32_t>();
+#endif
+  return true;
+}
+
+// Runs the tests both at run-time and during constant evaluation.
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_unicode.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/std_format_spec_string_unicode.pass.cpp
@ -0,0 +1,270 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-no-concepts
+// UNSUPPORTED: libcpp-has-no-incomplete-format
+
+// UTF-32 doesn't work properly
+// XFAIL: windows
+
+// <format>
+
+// Tests the Unicode width support of the standard format specifiers.
+// It tests [format.string.std]/8 - 11:
+// - Properly determining the estimated width of a Unicode string.
+// - Properly truncating to the wanted maximum width.
+
+// This version runs the test when the platform has Unicode support.
+// UNSUPPORTED: libcpp-has-no-unicode
+
+#include <format>
+#include <cassert>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+#define CSTR(S) MAKE_CSTRING(CharT, S)
+
+using namespace std::__format_spec;
+
+// Member-wise equality of two __string_alignment values, so that a complete
+// expectation can be checked with a single assert.
+template <class CharT>
+constexpr bool operator==(const __string_alignment<CharT>& lhs,
+                          const __string_alignment<CharT>& rhs) noexcept {
+  if (lhs.__last != rhs.__last)
+    return false;
+  if (lhs.__size != rhs.__size)
+    return false;
+  return lhs.__align == rhs.__align;
+}
+
+// Runs __get_string_alignment over all of str with the given width and
+// precision, and asserts that the result equals the expectation built from
+// the first three arguments:
+// - offset: distance of the expected __last from the start of str,
+// - size:   expected __size,
+// - align:  expected __align.
+template <class CharT>
+constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
+                                    const CharT* str, size_t width,
+                                    size_t precision) {
+  const std::basic_string_view<CharT> input{str};
+
+  const __string_alignment<CharT> actual =
+      __get_string_alignment(input.begin(), input.end(), width, precision);
+  const __string_alignment<CharT> wanted{input.begin() + offset, size, align};
+
+  assert(actual == wanted);
+}
+
+// Asserts that __estimate_column_width_fast, run over all of str, returns the
+// position that lies `expected` code units after the start of str.
+template <class CharT>
+constexpr void estimate_column_width_fast(size_t expected, const CharT* str) {
+  const std::basic_string_view<CharT> input{str};
+  const CharT* const stop =
+      __detail::__estimate_column_width_fast(input.begin(), input.end());
+  assert(input.begin() + expected == stop);
+}
+
+// Expectations for __estimate_column_width_fast. The first argument of every
+// call is the number of code units of the string the function is expected to
+// step over before it stops.
+template <class CharT>
+constexpr void estimate_column_width_fast() {
+  // ASCII only: the whole string is stepped over.
+  estimate_column_width_fast(3, CSTR("abc"));
+  estimate_column_width_fast(3, CSTR("a\u007fc"));
+
+  if constexpr (sizeof(CharT) != 1) {
+    // UTF-16/32 stop at the first multi-column character, so all of these
+    // strings are expected to be stepped over completely.
+    estimate_column_width_fast(3, CSTR("\u0080bc"));
+    estimate_column_width_fast(3, CSTR("a\u0080c"));
+    estimate_column_width_fast(3, CSTR("ab\u0080"));
+    estimate_column_width_fast(3, CSTR("aßc"));
+
+    estimate_column_width_fast(3, CSTR("a\u07ffc"));
+    estimate_column_width_fast(3, CSTR("a\u0800c"));
+
+    estimate_column_width_fast(3, CSTR("a\u10ffc"));
+  } else {
+    // UTF-8 stops at the first multi-byte character, wherever it is placed.
+    estimate_column_width_fast(0, CSTR("\u0080bc"));
+    estimate_column_width_fast(1, CSTR("a\u0080c"));
+    estimate_column_width_fast(2, CSTR("ab\u0080"));
+    estimate_column_width_fast(1, CSTR("aßc"));
+
+    estimate_column_width_fast(1, CSTR("a\u07ffc"));
+    estimate_column_width_fast(1, CSTR("a\u0800c"));
+
+    estimate_column_width_fast(1, CSTR("a\u10ffc"));
+  }
+
+  // From the first 2-column character (U+1100) onwards every character type
+  // is expected to stop directly after the leading 'a'.
+  estimate_column_width_fast(1, CSTR("a\u1100c"));
+
+  estimate_column_width_fast(1, CSTR("a\U0000ffffc"));
+  estimate_column_width_fast(1, CSTR("a\U00010000c"));
+  estimate_column_width_fast(1, CSTR("a\U0010FFFFc"));
+}
+
+// Asserts that __estimate_column_width, run over all of str, reports
+// `expected` in the __width member of its result.
+// NOTE(review): the third argument (-1) is presumably an upper limit that is
+// meant to be "as large as possible" - confirm against the callee.
+template <class CharT>
+constexpr void estimate_column_width(size_t expected, const CharT* str) {
+  const std::basic_string_view<CharT> input{str};
+  const __detail::__column_width_result<CharT> result =
+      __detail::__estimate_column_width(input.begin(), input.end(), -1);
+  assert(expected == result.__width);
+}
+
+template <class CharT> // Expected column width (1 or 2) of single code points, grouped by their UTF-8 length.
+constexpr void estimate_column_width() {
+  //*** 1-byte code points ***
+  estimate_column_width(1, CSTR(" "));
+  estimate_column_width(1, CSTR("~"));
+
+  //*** 2-byte code points ***
+  estimate_column_width(1, CSTR("\u00a1")); // INVERTED EXCLAMATION MARK
+  estimate_column_width(1, CSTR("\u07ff")); // NKO TAMAN SIGN
+
+  //*** 3-byte code points ***
+  estimate_column_width(1, CSTR("\u0800")); // SAMARITAN LETTER ALAF
+  estimate_column_width(1, CSTR("\ufffd")); // REPLACEMENT CHARACTER
+
+  // 2-column ranges, tested in pairs; a comment starting with "U+" names a neighbouring code point, not the one tested.
+  estimate_column_width(2, CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK
+  estimate_column_width(2, CSTR("\u115f")); // HANGUL CHOSEONG FILLER
+
+  estimate_column_width(2, CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET
+  estimate_column_width(2, CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET
+
+  estimate_column_width(2, CSTR("\u2e80")); // CJK RADICAL REPEAT
+  estimate_column_width(2, CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
+
+  estimate_column_width(2, CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
+  estimate_column_width(2, CSTR("\ua4cf")); // U+A4D0 LISU LETTER BA
+
+  estimate_column_width(2, CSTR("\uac00")); // <Hangul Syllable, First>
+  estimate_column_width(2, CSTR("\ud7a3")); // Hangul Syllable Hih
+
+  estimate_column_width(2, CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
+  estimate_column_width(2, CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
+
+  estimate_column_width(2,
+                        CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
+  estimate_column_width(
+      2, CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
+
+  estimate_column_width(
+      2, CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
+  estimate_column_width(2,
+                        CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
+
+  estimate_column_width(2, CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
+  estimate_column_width(2, CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
+
+  estimate_column_width(2, CSTR("\uffe0")); // FULLWIDTH CENT SIGN
+  estimate_column_width(2, CSTR("\uffe6")); // FULLWIDTH WON SIGN
+
+  //*** 4-byte code points ***
+  estimate_column_width(1, CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A
+  estimate_column_width(1, CSTR("\U0010FFFF")); // Undefined Character
+
+  // 2-column ranges, tested in pairs.
+  estimate_column_width(2, CSTR("\U0001f300")); // CYCLONE
+  estimate_column_width(2, CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS
+  estimate_column_width(
+      2, CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
+  estimate_column_width(2, CSTR("\U0001f9ff")); // NAZAR AMULET
+  estimate_column_width(
+      2, CSTR("\U00020000")); // <CJK Ideograph Extension B, First>
+  estimate_column_width(2, CSTR("\U0002fffd")); // Undefined Character
+  estimate_column_width(
+      2, CSTR("\U00030000")); // <CJK Ideograph Extension G, First>
+  estimate_column_width(2, CSTR("\U0003fffd")); // Undefined Character
+}
+
+// Expectations for __get_string_alignment with Unicode width estimation.
+// Every call lists, in this order: the expected offset of __last from the
+// start of the input, the expected __size, the expected __align, the input,
+// the width and the precision (-1 converts to the largest size_t value).
+template <class CharT>
+constexpr void get_string_alignment() {
+  // Several inputs hold one character that is a single code unit for the
+  // wide character types but three code units for a 1-byte CharT (UTF-8).
+  // The expected end offset of those inputs grows by this amount.
+  constexpr int extra = 2 * (sizeof(CharT) == 1);
+
+  // Truncate the input.
+  get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
+
+  // The 2-column character gets entirely rejected.
+  get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2);
+
+  // Due to the requested width extra alignment is required.
+  get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2);
+
+  // The same two cases for a 2-column character that is a 4-byte UTF-8
+  // sequence.
+  get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2);
+  get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2);
+
+  // No alignment required.
+  get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
+  get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
+
+  // Special case: a special character has already been parsed and there is
+  // enough width to satisfy the minimum required width. The expected size
+  // is 0.
+  get_string_alignment(3 + extra, 0, false, CSTR("ab\u1111"), 2, -1);
+
+  // Evaluates all so size -> 4
+  get_string_alignment(3 + extra, 4, false,
+                       CSTR("a\u115fc") /* 2-column character */, 3, -1);
+  // Evaluates all so size -> 4
+  get_string_alignment(3 + extra, 4, false,
+                       CSTR("a\u115fc") /* 2-column character */, 4, -1);
+
+  // Evaluates all so size -> 5
+  get_string_alignment(4 + extra, 5, false,
+                       CSTR("a\u115fcd") /* 2-column character */, 4, -1);
+
+  // Evaluates all so size -> 5
+  get_string_alignment(4 + extra, 5, false,
+                       CSTR("a\u115fcd") /* 2-column character */, 5, -1);
+
+  // Extend width.
+  get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
+  get_string_alignment(3 + extra, 3, true,
+                       CSTR("a\u1160c") /* 1-column character */, 4, -1);
+
+  // Threshold case: with width 2 the width is still determined exactly.
+  get_string_alignment(2 + extra, 3, false, CSTR("i\u1110"), 2, -1);
+
+  // With width 1 the width is no longer exactly determined; the expected
+  // size is 0.
+  get_string_alignment(2 + extra, 0, false, CSTR("i\u1110"), 1, -1);
+
+  // Extend width and truncate input.
+  get_string_alignment(1, 1, true, CSTR("abc"), 3, 1);
+
+  if constexpr (sizeof(CharT) == 1) {
+    // Corrupt UTF-8 sequence: a lead byte without its continuation bytes.
+    get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3);
+    get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3);
+    get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3);
+  } else if constexpr (sizeof(CharT) == 2) {
+    if constexpr (std::same_as<CharT, char16_t>) {
+      // Corrupt UTF-16 sequence.
+      get_string_alignment(2, 2, false, u"a\xdddd", 0, 3);
+    } else {
+      // Corrupt UTF-16 wchar_t sequence.
+      get_string_alignment(2, 2, false, L"a\xdddd", 0, 3);
+    }
+  }
+  // UTF-32 doesn't combine characters, thus no corruption tests.
+}
+
+template <class CharT> // Runs all three expectation groups of this file for one character type.
+constexpr void test() {
+  estimate_column_width_fast<CharT>();
+  estimate_column_width<CharT>();
+  get_string_alignment<CharT>();
+}
+
+constexpr bool test() { // Runs test<CharT>() for char, wchar_t and the guarded character types.
+  test<char>();
+  test<wchar_t>();
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
+  test<char8_t>();
+#endif
+#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
+  test<char16_t>();
+  test<char32_t>();
+#endif
+  return true; // The bool result lets main() use test() inside static_assert.
+}
+
+int main(int, char**) {
+  test(); // Evaluate the expectations at run time.
+  static_assert(test()); // Evaluate the same expectations during constant evaluation.
+
+  return 0;
+}