[libc++][format] Implement Unicode support.

This adds the width estimation functions to the std-format-spec.

Implements parts of:
- P0645 Text Formatting
- P1868 width: clarifying units of width and precision in std::format

Reviewed By: #libc, ldionne, vitaut

Differential Revision: https://reviews.llvm.org/D103413
This commit is contained in:
Mark de Wever 2021-05-25 20:32:38 +02:00
parent f33274c7bf
commit ac7031b2b2
5 changed files with 1041 additions and 1 deletions

View File

@ -0,0 +1,196 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP_HAS_NO_UNICODE
#include <array>
#include <format>
#include "benchmark/benchmark.h"
#include "test_macros.h"
template <class CharT, size_t N>
class tester {
  // Number of code units in the literal, without the terminating NUL.
  static constexpr size_t size_ = N - 1;
  // The benchmark input: the literal repeated 100 times back to back.
  std::array<CharT, 100 * size_> data_;

public:
  /// Fills the buffer with 100 consecutive copies of @p input.
  explicit constexpr tester(const CharT (&input)[N]) {
    auto out = data_.begin();
    int remaining = 100;
    while (remaining-- > 0)
      out = std::copy_n(input, size_, out);
  }

  /// The total number of code units in the buffer.
  constexpr size_t size() const noexcept { return data_.size(); }

  /// Points to the first code unit of the buffer.
  constexpr const CharT* begin() const noexcept { return data_.begin(); }

  /// Points beyond the last code unit of the buffer.
  constexpr const CharT* end() const noexcept { return data_.end(); }

  /// Runs the width estimation over the whole buffer for every benchmark
  /// iteration. The width and precision are larger than the buffer, so the
  /// scan isn't cut short by either of them.
  void test(benchmark::State& state) const {
    for (auto _ : state)
      benchmark::DoNotOptimize(std::__format_spec::__get_string_alignment(
          begin(), end(), 1'000'000, 1'000'000));

    const auto processed = state.iterations() * size();
    state.SetItemsProcessed(processed);
  }
};
// Builds a constexpr tester for the string literal u8 and runs it.
//
// The literal is used as-is when CharT is char. For char16_t it's combined with
// a u prefix and for all other types with a U prefix (via TEST_CONCAT, which
// presumably pastes the tokens; it's defined outside this file).
// The macro expects CharT and state to be in scope where it's expanded.
#define TEST(u8) \
if constexpr (std::same_as<CharT, char>) { \
constexpr auto p = tester{u8}; \
p.test(state); \
} else if constexpr (std::same_as<CharT, char16_t>) { \
constexpr auto p = tester{TEST_CONCAT(u, u8)}; \
p.test(state); \
} else { \
constexpr auto p = tester{TEST_CONCAT(U, u8)}; \
p.test(state); \
}
// Benchmarks an English pangram; the input only contains ASCII characters.
template <class CharT>
static void BM_EstimateLengthNoMultiByte(benchmark::State& state) {
TEST("The quick brown fox jumps over the lazy dog");
}
// Benchmarks a German pangram; mostly ASCII with a few non-ASCII letters.
template <class CharT>
static void BM_EstimateLengthTwoByteDE(benchmark::State& state) {
// Guards the encoding of the source file: the narrow literal is expected to be
// 67 code units, including the terminating NUL.
static_assert(sizeof("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == 67);
// https://en.wikipedia.org/wiki/Pangram
TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich");
}
// Benchmarks a Polish pangram; it has a higher share of non-ASCII letters than
// the German pangram.
template <class CharT>
static void BM_EstimateLengthTwoBytePL(benchmark::State& state) {
// Guards the encoding of the source file: the narrow literal is expected to be
// 53 code units, including the terminating NUL.
static_assert(sizeof("Stróż pchnął kość w quiz gędźb vel fax myjń") == 53);
// https://en.wikipedia.org/wiki/Pangram
TEST("Stróż pchnął kość w quiz gędźb vel fax myjń");
}
// All values are below 0x1100, which is the first multi-column code point.
template <class CharT>
static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) {
// 16 code points, 3 code units each in the narrow literal, plus the
// terminating NUL.
static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
49);
TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
}
// All values are at or above 0x1100 but aren't in one of the two-column ranges,
// so every code point is estimated as a single column.
template <class CharT>
static void
BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) {
// 16 code points, 3 code units each in the narrow literal, plus the
// terminating NUL.
static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") ==
49);
TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f");
}
// All values are in the range [0x1100, 0x115f], the first range of code points
// with an estimated width of two columns. Every code point in the input needs
// to be a two-column code point, else the benchmark measures the same mostly
// single-column input as the other three-byte benchmarks.
template <class CharT>
static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) {
  // 16 code points, 3 code units each in the narrow literal, plus the
  // terminating NUL.
  static_assert(sizeof("\u1100\u1101\u1102\u1103\u1104\u1105\u1106\u1107"
                       "\u1108\u1109\u110a\u110b\u110c\u110d\u110e\u110f") ==
                49);
  TEST("\u1100\u1101\u1102\u1103\u1104\u1105\u1106\u1107"
       "\u1108\u1109\u110a\u110b\u110c\u110d\u110e\u110f");
}
// A mixture of single-column and two-column code points that all need three
// code units in UTF-8.
template <class CharT>
static void BM_EstimateLengthThreeByte(benchmark::State& state) {
// 16 code points, 3 code units each in the narrow literal, plus the
// terminating NUL.
static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") ==
49);
TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e");
}
// All values are at or above 0x1'0000 and below the first four-byte two-column
// range, so every code point is estimated as a single column.
template <class CharT>
static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) {
// 16 code points, 4 code units each in the narrow literal, plus the
// terminating NUL.
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
"\U00010004\U00010005\U00010006\U00010007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0001000c\U0001000d\U0001000e\U0001000f") == 65);
TEST("\U00010000\U00010001\U00010002\U00010003"
"\U00010004\U00010005\U00010006\U00010007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0001000c\U0001000d\U0001000e\U0001000f");
}
// All values are in the range [0x2'0000, 0x2'fffd], so every code point is
// estimated as two columns. The input is the consecutive sequence
// U+20000 - U+2000F.
template <class CharT>
static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) {
  // 16 code points, 4 code units each in the narrow literal, plus the
  // terminating NUL.
  static_assert(sizeof("\U00020000\U00020001\U00020002\U00020003"
                       "\U00020004\U00020005\U00020006\U00020007"
                       "\U00020008\U00020009\U0002000a\U0002000b"
                       "\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
  TEST("\U00020000\U00020001\U00020002\U00020003"
       "\U00020004\U00020005\U00020006\U00020007"
       "\U00020008\U00020009\U0002000a\U0002000b"
       "\U0002000c\U0002000d\U0002000e\U0002000f");
}
// A mixture of single-column (0x1'00xx) and two-column (0x2'00xx) code points
// that all need four code units in UTF-8.
template <class CharT>
static void BM_EstimateLengthFourByte(benchmark::State& state) {
// 16 code points, 4 code units each in the narrow literal, plus the
// terminating NUL.
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
TEST("\U00010000\U00010001\U00010002\U00010003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0002000c\U0002000d\U0002000e\U0002000f");
}
// Every benchmark is registered once per character type.

// char
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char);
// char16_t
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char16_t);
// char32_t
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char32_t);
// Benchmark entry point: hands the command line to the benchmark library and
// runs the selected benchmarks. Returns 1 when the command line contains
// arguments the library doesn't recognize, 0 otherwise.
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);

  const bool unrecognized = benchmark::ReportUnrecognizedArguments(argc, argv);
  if (!unrecognized) {
    benchmark::RunSpecifiedBenchmarks();
    return 0;
  }
  return 1;
}
#else
// Without Unicode support there's nothing to benchmark; succeed right away.
int main(int, char**) { return 0; }
#endif

View File

@ -171,7 +171,7 @@
"`P1460 <https://wg21.link/P1460>`__","LWG","Mandating the Standard Library: Clause 20 - Utilities library","Prague","* *",""
"`P1739 <https://wg21.link/P1739>`__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","* *",""
"`P1831 <https://wg21.link/P1831>`__","LWG","Deprecating volatile: library","Prague","* *",""
"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","* *",""
"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","|In Progress|",""
"`P1908 <https://wg21.link/P1908>`__","CWG","Reserving Attribute Namespaces for Future Use","Prague","* *",""
"`P1937 <https://wg21.link/P1937>`__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *",""
"`P1956 <https://wg21.link/P1956>`__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0"

1 Paper # Group Paper Name Meeting Status First released version
171 `P1460 <https://wg21.link/P1460>`__ LWG Mandating the Standard Library: Clause 20 - Utilities library Prague * *
172 `P1739 <https://wg21.link/P1739>`__ LWG Avoid template bloat for safe_ranges in combination with "subrange-y" view adaptors Prague * *
173 `P1831 <https://wg21.link/P1831>`__ LWG Deprecating volatile: library Prague * *
174 `P1868 <https://wg21.link/P1868>`__ LWG width: clarifying units of width and precision in std::format Prague * * |In Progress|
175 `P1908 <https://wg21.link/P1908>`__ CWG Reserving Attribute Namespaces for Future Use Prague * *
176 `P1937 <https://wg21.link/P1937>`__ CWG Fixing inconsistencies between constexpr and consteval functions Prague * *
177 `P1956 <https://wg21.link/P1956>`__ LWG On the names of low-level bit manipulation functions Prague |Complete| 12.0

View File

@ -10,12 +10,15 @@
#ifndef _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
#define _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
#include <__algorithm/find_if.h>
#include <__algorithm/min.h>
#include <__config>
#include <__debug>
#include <__format/format_arg.h>
#include <__format/format_error.h>
#include <__format/format_string.h>
#include <__variant/monostate.h>
#include <bit>
#include <concepts>
#include <cstdint>
#include <type_traits>
@ -24,6 +27,9 @@
# pragma GCC system_header
#endif
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 17
@ -711,6 +717,462 @@ protected:
// TODO FMT Add a parser for floating-point values.
// TODO FMT Add a parser for pointer values.
/**
 * Helper struct returned from @ref __get_string_alignment.
 *
 * It tells the output algorithm which part of the input to write and whether
 * padding needs to be added.
 */
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __string_alignment {
/** Points beyond the last character to write to the output. */
const _CharT* __last;
/**
 * The estimated number of columns in the output or 0.
 *
 * Only when the output needs to be aligned it's required to know the exact
 * number of columns in the output. So if the formatted output has only a
 * minimum width the exact size isn't important. It's only important to know
 * the minimum has been reached. The minimum width is the width specified in
 * the format-spec.
 *
 * For example in this code @code std::format("{:10}", MyString); @endcode
 * the width estimation can stop once the algorithm has determined the output
 * width is 10 columns.
 *
 * So if:
 * * @ref __align == @c true the @ref __size is the estimated number of
 *   columns required.
 * * @ref __align == @c false the @ref __size is the estimated number of
 *   columns required or 0 when the estimation algorithm stopped prematurely.
 */
ptrdiff_t __size;
/**
 * Does the output need to be aligned.
 *
 * When alignment is needed the output algorithm needs to add the proper
 * padding. Else the output algorithm just needs to copy the input up to
 * @ref __last.
 */
bool __align;
};
#ifndef _LIBCPP_HAS_NO_UNICODE
namespace __detail {
/**
* Unicode column width estimates.
*
* Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
* Depending on format the relation between the number of code units stored and
* the number of output columns differs. The first relation is the number of
* code units forming a code point. (The text assumes the code units are
* unsigned.)
* - UTF-8: The number of code units is between one and four. The first 128
* Unicode code points match the ASCII character set. When the highest bit is
* set it means the code point has more than one code unit.
* - UTF-16: The number of code units is between 1 and 2. When the first
* code unit is in the range [0xd800,0xdfff] it means the code point uses two
* code units.
* - UTF-32: The number of code units is always one.
*
* The code point to the number of columns isn't well defined. The code uses the
* estimations defined in [format.string.std]/11. This list might change in the
* future.
*
* The algorithm of @ref __get_string_alignment uses two different scanners:
* - The simple scanner @ref __estimate_column_width_fast. This scanner assumes
* 1 code unit is 1 column. This scanner stops when it can't be sure the
* assumption is valid:
* - UTF-8 when the code point is encoded in more than 1 code unit.
* - UTF-16 and UTF-32 when the first multi-column code point is encountered.
* (The code unit's value is lower than 0xd800 so the 2 code unit encoding
* is irrelevant for this scanner.)
* Due to these assumptions the scanner is faster than the full scanner. It
* can process all text only containing ASCII. For UTF-16/32 it can process
* most (all?) European languages. (Note the set it can process might be
* reduced in the future, due to updates in the scanning rules.)
* - The full scanner @ref __estimate_column_width. This scanner, if needed,
* converts multiple code units into one code point then converts the code
* point to a column width.
*
* See also:
* - [format.string.std]/11
* - https://en.wikipedia.org/wiki/UTF-8#Encoding
* - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
*/
/**
 * The first 2 column code point.
 *
 * This is the point where the fast UTF-16/32 scanner needs to stop processing.
 * The value is the lower bound of the first two-column range tested in
 * @ref __column_width_3.
 */
inline constexpr uint32_t __two_column_code_point = 0x1100;
/**
 * Helper concept for an UTF-8 character type.
 *
 * Satisfied by @c char and, when the type is available, by @c char8_t.
 * @c char8_t isn't a keyword when the compiler's char8_t support is disabled
 * (@c _LIBCPP_HAS_NO_CHAR8_T), so it may only be named when it's available.
 */
#ifndef _LIBCPP_HAS_NO_CHAR8_T
template <class _CharT>
concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>;
#else
template <class _CharT>
concept __utf8_character = same_as<_CharT, char>;
#endif
/**
 * Helper concept for an UTF-16 character type.
 *
 * Satisfied by @c char16_t and by @c wchar_t when @c sizeof(wchar_t) is 2.
 */
template <class _CharT>
concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>;
/**
 * Helper concept for an UTF-32 character type.
 *
 * Satisfied by @c char32_t and by @c wchar_t when @c sizeof(wchar_t) is 4.
 */
template <class _CharT>
concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>;
/**
 * Helper concept for an UTF-16 or UTF-32 character type.
 *
 * Used to select the scanner that stops at the first multi-column code point
 * instead of the first multi code unit code point.
 */
template <class _CharT>
concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>;
/**
 * Converts a code point to the column width.
 *
 * The estimations are conforming to [format.string.std]/11
 *
 * This version expects a value less than 0x1'0000, which is a code point
 * encoded in at most 3 code units in UTF-8.
 *
 * @param __c The code point to convert.
 *
 * @returns 2 when @p __c is in one of the two-column ranges listed in the
 *          expression below, 1 otherwise.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept {
_LIBCPP_ASSERT(__c < 0x1'0000,
"Use __column_width_4 or __column_width for larger values");
// The ranges are listed in ascending order. Every line tests one inclusive
// range: the lower bound guards the rest of the expression, so the evaluation
// stops at the first range whose lower bound is larger than __c. The boolean
// result is added to 1, giving a width of 1 or 2.
// clang-format off
return 1 + (__c >= 0x1100 && (__c <= 0x115f ||
(__c >= 0x2329 && (__c <= 0x232a ||
(__c >= 0x2e80 && (__c <= 0x303e ||
(__c >= 0x3040 && (__c <= 0xa4cf ||
(__c >= 0xac00 && (__c <= 0xd7a3 ||
(__c >= 0xf900 && (__c <= 0xfaff ||
(__c >= 0xfe10 && (__c <= 0xfe19 ||
(__c >= 0xfe30 && (__c <= 0xfe6f ||
(__c >= 0xff00 && (__c <= 0xff60 ||
(__c >= 0xffe0 && (__c <= 0xffe6
))))))))))))))))))));
// clang-format on
}
/**
 * Converts a code point to the column width.
 *
 * This version expects a value greater than or equal to 0x1'0000, which is a
 * 4-byte UTF-8 character.
 *
 * @param __c The code point to convert.
 *
 * @returns 2 when @p __c is in one of the two-column ranges listed in the
 *          expression below, 1 otherwise.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept {
_LIBCPP_ASSERT(__c >= 0x1'0000,
"Use __column_width_3 or __column_width for smaller values");
// The ranges are listed in ascending order; the lower bound of a range guards
// the rest of the expression. The boolean result is added to 1, giving a width
// of 1 or 2.
// clang-format off
return 1 + (__c >= 0x1'f300 && (__c <= 0x1'f64f ||
(__c >= 0x1'f900 && (__c <= 0x1'f9ff ||
(__c >= 0x2'0000 && (__c <= 0x2'fffd ||
(__c >= 0x3'0000 && (__c <= 0x3'fffd
))))))));
// clang-format on
}
/**
 * Converts a code point to the column width.
 *
 * The general case, accepting all values. The work is forwarded to
 * @ref __column_width_3 for values less than 0x1'0000 and to
 * @ref __column_width_4 for all other values.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept {
  return __c < 0x1'0000 ? __column_width_3(__c) : __column_width_4(__c);
}
/**
 * Estimate the column width for the UTF-8 sequence using the fast algorithm.
 *
 * @returns A pointer to the first code unit with its highest bit set, or
 *          @p __last when the range has no such code unit. All code units
 *          before the returned position are one column wide.
 */
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
__estimate_column_width_fast(const _CharT* __first,
                             const _CharT* __last) noexcept {
  // A set highest bit means the code unit is part of a multi code unit
  // sequence.
  auto __is_multi_byte = [](unsigned char __unit) {
    return __unit & 0x80;
  };
  return _VSTD::find_if(__first, __last, __is_multi_byte);
}
/**
 * @overload
 *
 * The implementation for UTF-16/32.
 *
 * @returns A pointer to the first code unit whose value is greater than or
 *          equal to @ref __two_column_code_point, or @p __last when the range
 *          has no such code unit. All code units before the returned position
 *          are one column wide.
 */
template <__utf16_or_32_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
__estimate_column_width_fast(const _CharT* __first,
                             const _CharT* __last) noexcept {
  // Use the named constant instead of repeating its value; the stop condition
  // of this scanner has to stay in sync with the first two-column range.
  return _VSTD::find_if(__first, __last, [](uint32_t __c) {
    return __c >= __two_column_code_point;
  });
}
/** The result of the column width estimation of a range of code units. */
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __column_width_result {
/** The number of output columns. */
size_t __width;
/**
 * The last parsed element.
 *
 * This limits the original output to fit in the wanted number of columns.
 * The estimation functions set it to the position where the scan stopped;
 * the elements before this position account for @ref __width columns.
 */
const _CharT* __ptr;
};
/**
 * Small helper to determine the width of malformed Unicode.
 *
 * Every remaining code unit in the input is treated as one output column,
 * limited to the number of columns still available.
 *
 * @note This function's only needed for UTF-8. During scanning UTF-8 there
 * are multiple places where it can be detected that the Unicode is malformed.
 * UTF-16 only requires 1 test and UTF-32 requires no testing.
 *
 * @param __first   Points to the first malformed code unit.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns for the entire scan.
 * @param __result  The number of output columns already used by the code
 *                  units before @p __first.
 *
 * @returns The total number of columns, which doesn't exceed @p __maximum
 *          unless @p __result already did, and the position where the output
 *          needs to be truncated.
 */
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last,
                                  size_t __maximum, size_t __result) noexcept {
  size_t __size = __last - __first;
  // __maximum is the budget for the entire scan, so only the columns not used
  // by __result are available for the malformed tail. Guard against underflow
  // of the unsigned subtraction.
  size_t __available = __result < __maximum ? __maximum - __result : 0;
  size_t __n = _VSTD::min(__size, __available);
  return {__result + __n, __first + __n};
}
/**
 * Determines the number of output columns needed to render the input.
 *
 * @note When the scanner encounters malformed Unicode it acts as-if every code
 * unit at the end of the input is one output column. It's expected the output
 * terminal will replace these malformed code units with a one column
 * replacement character.
 *
 * @param __first   Points to the first element of the input range.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns. The returned number
 *                  of estimated output columns will not exceed this value.
 *
 * @returns The number of estimated columns and the position where the scan
 *          stopped. The scan stops at the end of the input, when the maximum
 *          is reached, or before a two-column code point that doesn't fit.
 */
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
size_t __maximum) noexcept {
size_t __result = 0;
while (__first != __last) {
// Based on the number of leading 1 bits the number of code units in the
// code point can be determined. See
// https://en.wikipedia.org/wiki/UTF-8#Encoding
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) {
case 0: // 1-code unit encoding: all 1 column
++__result;
++__first;
break;
case 2: // 2-code unit encoding: all 1 column
// Malformed Unicode.
if (__last - __first < 2) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
__first += 2;
++__result;
break;
case 3: // 3-code unit encoding: either 1 or 2 columns
// Malformed Unicode.
if (__last - __first < 3) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
{
// Assemble the code point from the payload bits of the three code units.
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__result += __column_width_3(__c);
// Only a two-column code point can overshoot the maximum. Undo its two
// columns and its three code units so the output ends before it.
if (__result > __maximum)
return {__result - 2, __first - 3};
}
break;
case 4: // 4-code unit encoding: either 1 or 2 columns
// Malformed Unicode.
if (__last - __first < 4) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
{
// Assemble the code point from the payload bits of the four code units.
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__result += __column_width_4(__c);
// Only a two-column code point can overshoot the maximum. Undo its two
// columns and its four code units so the output ends before it.
if (__result > __maximum)
return {__result - 2, __first - 4};
}
break;
default:
// Malformed Unicode.
// Reached for one leading 1 bit (a continuation code unit at the position
// of a leading code unit) and for more than four leading 1 bits.
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
}
// The maximum has been reached exactly; no need to look at more input.
if (__result >= __maximum)
return {__result, __first};
}
return {__result, __first};
}
/**
 * Determines the number of output columns needed to render the input.
 *
 * The implementation for UTF-16.
 *
 * @note A surrogate code unit that isn't part of a well-formed surrogate pair
 * is malformed Unicode. Such a code unit is treated as one output column and
 * the scan continues with the next code unit. This avoids computing a bogus
 * code point from the pair, which could be a value @ref __column_width_4
 * isn't allowed to be called with.
 *
 * @param __first   Points to the first element of the input range.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns. The returned number
 *                  of estimated output columns will not exceed this value.
 */
template <__utf16_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
                        size_t __maximum) noexcept {
  size_t __result = 0;

  while (__first != __last) {
    uint32_t __c = *__first;
    // Is the code unit part of a surrogate pair? See
    // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
    if (__c >= 0xd800 && __c <= 0xdfff) {
      // A well-formed pair is a high surrogate [0xd800, 0xdbff] directly
      // followed by a low surrogate [0xdc00, 0xdfff].
      uint32_t __low = 0;
      bool __well_formed = false;
      if (__c <= 0xdbff && __last - __first >= 2) {
        __low = *(__first + 1);
        __well_formed = __low >= 0xdc00 && __low <= 0xdfff;
      }

      if (!__well_formed) [[unlikely]] {
        // Malformed Unicode: one column for the unpaired surrogate.
        ++__result;
        ++__first;
      } else {
        __c -= 0xd800;
        __c <<= 10;
        __c += __low - 0xdc00;
        __c += 0x1'0000;

        __result += __column_width_4(__c);
        // A two-column code point that doesn't fit; stop before it.
        if (__result > __maximum)
          return {__result - 2, __first};
        __first += 2;
      }
    } else {
      __result += __column_width_3(__c);
      // A two-column code point that doesn't fit; stop before it.
      if (__result > __maximum)
        return {__result - 2, __first};
      ++__first;
    }

    if (__result >= __maximum)
      return {__result, __first};
  }

  return {__result, __first};
}
/**
 * Determines the number of output columns needed to render the input.
 *
 * The implementation for UTF-32; every code unit is one code point.
 *
 * @param __first   Points to the first element of the input range.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns. The returned number
 *                  of estimated output columns will not exceed this value.
 */
template <__utf32_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
                        size_t __maximum) noexcept {
  size_t __result = 0;

  while (__first != __last) {
    // Don't store the code point in a wchar_t: this overload is also used for
    // char32_t and a wchar_t may be only 16 bits wide, which would truncate
    // the code point.
    uint32_t __c = *__first;
    __result += __column_width(__c);

    // A two-column code point that doesn't fit; stop before it.
    if (__result > __maximum)
      return {__result - 2, __first};

    ++__first;
    if (__result >= __maximum)
      return {__result, __first};
  }

  return {__result, __first};
}
} // namespace __detail
/**
 * Determines which part of a string needs to be written to the output and
 * whether the output needs to be padded.
 *
 * @param __first     Points to the first element of the input range.
 * @param __last      Points beyond the last element of the input range.
 * @param __width     The minimum number of output columns.
 * @param __precision The maximum number of output columns, or -1 when there's
 *                    no maximum.
 *
 * @pre @p __width != 0 || @p __precision != -1
 *
 * @returns The end of the input to write, the estimated number of columns (or
 *          0 when the estimation stopped early), and whether padding is
 *          needed. See @ref __string_alignment.
 */
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
__get_string_alignment(const _CharT* __first, const _CharT* __last,
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
_LIBCPP_ASSERT(__width != 0 || __precision != -1,
"The function has no effect and shouldn't be used");
// TODO FMT There might be more optimizations possible:
// If __precision == __format::__number_max and the encoding is:
// * UTF-8 : 4 * (__last - __first) >= __width
// * UTF-16 : 2 * (__last - __first) >= __width
// * UTF-32 : (__last - __first) >= __width
// In these cases it's certain the output is at least the requested width.
// It's unknown how often this happens in practice. For now the improvement
// isn't implemented.
/*
* First assume there are no special Unicode code units in the input.
* - Apply the precision (this may reduce the size of the input). When
* __precision == -1 this step is omitted.
* - Scan for special code units in the input.
* If our assumption was correct the __pos will be at the end of the input.
*/
const ptrdiff_t __length = __last - __first;
const _CharT* __limit =
__first +
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
ptrdiff_t __size = __limit - __first;
const _CharT* __pos =
__detail::__estimate_column_width_fast(__first, __limit);
// Every code unit in [__first, __limit) is one column, so the number of code
// units is the number of columns.
if (__pos == __limit)
return {__limit, __size, __size < __width};
/*
* Our assumption was wrong, there are special Unicode code units.
* The range [__first, __pos) contains a set of code units with the
* following property:
* Every _CharT in the range will be rendered in 1 column.
*
* If there's no maximum width and the parsed size already exceeds the
* minimum required width. The real size isn't important. So bail out.
*/
if (__precision == -1 && (__pos - __first) >= __width)
return {__last, 0, false};
/* If there's a __precision, truncate the output to that width. */
ptrdiff_t __prefix = __pos - __first;
if (__precision != -1) {
_LIBCPP_ASSERT(__precision > __prefix, "Logic error.");
// The prefix already uses __prefix columns; the full scanner gets the
// remaining columns as its maximum.
auto __lengh_info = __detail::__estimate_column_width(
__pos, __last, __precision - __prefix);
__size = __lengh_info.__width + __prefix;
return {__lengh_info.__ptr, __size, __size < __width};
}
/* Else use __width to determine the number of required padding characters. */
_LIBCPP_ASSERT(__width > __prefix, "Logic error.");
/*
* The column width is always one or two columns. For the precision the wanted
* column width is the maximum, for the width it's the minimum. Using the
* width estimation with its truncating behavior will result in the wrong
* result in the following case:
* - The last code unit processed requires two columns and exceeds the
* maximum column width.
* Increasing the __maximum by one avoids this issue. (It means it may
* pass one code point more than required to determine the proper result;
* that however isn't a problem for the algorithm.)
*/
size_t __maximum = 1 + __width - __prefix;
auto __lengh_info =
__detail::__estimate_column_width(__pos, __last, __maximum);
if (__lengh_info.__ptr != __last) {
// Consumed the width number of code units. The exact size of the string
// is unknown. We only know we don't need to align the output.
_LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >=
__width,
"Logic error");
return {__last, 0, false};
}
// The entire input has been scanned, so the estimated size is exact.
__size = __lengh_info.__width + __prefix;
return {__last, __size, __size < __width};
}
#else // _LIBCPP_HAS_NO_UNICODE
/**
 * The version used when there's no Unicode support: every code unit is
 * considered to be one output column.
 *
 * The input is limited to @p __precision code units, unless @p __precision is
 * -1. Padding is needed when the limited input has fewer code units than
 * @p __width.
 */
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
__get_string_alignment(const _CharT* __first, const _CharT* __last,
                       ptrdiff_t __width, ptrdiff_t __precision) noexcept {
  ptrdiff_t __size = __last - __first;
  if (__precision != -1)
    __size = _VSTD::min(__size, __precision);

  return {__first + __size, __size, __size < __width};
}
#endif // _LIBCPP_HAS_NO_UNICODE
} // namespace __format_spec
# endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
@ -719,4 +1181,6 @@ protected:
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
#endif // _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H

View File

@ -0,0 +1,110 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-no-concepts
// UNSUPPORTED: libcpp-has-no-incomplete-format
// UTF-32 doesn't work properly
// XFAIL: windows
// <format>
// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.
// This version runs the test when the platform doesn't have Unicode support.
// REQUIRES: libcpp-has-no-unicode
#include <format>
#include <cassert>
#include "test_macros.h"
#include "make_string.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
using namespace std::__format_spec;
// Member-wise equality of two __string_alignment objects; only needed to
// compare the expected and the actual result in this test.
template <class CharT>
constexpr bool operator==(const __string_alignment<CharT>& lhs,
                          const __string_alignment<CharT>& rhs) noexcept {
  if (lhs.__last != rhs.__last)
    return false;
  if (lhs.__size != rhs.__size)
    return false;
  return lhs.__align == rhs.__align;
}
// Calls __get_string_alignment for str and validates the result.
//
// offset:    the expected __last, as an offset from the start of str.
// size:      the expected __size.
// align:     the expected __align.
// width:     the minimum width passed to __get_string_alignment.
// precision: the maximum width passed to __get_string_alignment; -1 means
//            there's no maximum.
//
// width and precision are signed, like the parameters of
// __get_string_alignment. The callers pass -1 as precision, which shouldn't
// make a round-trip through an unsigned type.
template <class CharT>
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
                                    const CharT* str, ptrdiff_t width,
                                    ptrdiff_t precision) {
  std::basic_string_view<CharT> sv{str};
  __string_alignment<CharT> expected{sv.begin() + offset, size, align};
  __string_alignment<CharT> traits =
      __get_string_alignment(sv.begin(), sv.end(), width, precision);
  assert(traits == expected);
}
// Tests __get_string_alignment for a platform without Unicode support. The
// expected values below count every code unit as one column. The expected
// offset and size for the UTF-8 inputs are 2 larger than for the wide inputs
// since \u1111, \u115f, and \u1160 need 3 code units in UTF-8.
template <class CharT>
constexpr void get_string_alignment() {
// Truncate the input.
get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
// The precision is applied to the code units; the input is cut after 2 code
// units, even when that's in the middle of a multi code unit character.
get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2);
// No alignment since the number of code units equals the width.
get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2);
// Same but for a character needing a 4-byte UTF-8 sequence
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2);
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2);
// No alignment required.
get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), false, CSTR("ab\u1111"), 2,
-1);
// Without a precision all code units are accepted and counted, so the size is
// the number of code units in the input.
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), false,
CSTR("a\u115fc") /* 2-column character */, 3, -1);
// Extend width
get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), true,
CSTR("a\u1160c") /* 1-column character */, 6, -1);
}
// Runs all tests for one character type.
template <class CharT>
constexpr void test() {
get_string_alignment<CharT>();
}
// Runs the tests for all character types available on the platform.
// Returns true so the function can be used in a static_assert.
constexpr bool test() {
test<char>();
test<wchar_t>();
#ifndef _LIBCPP_HAS_NO_CHAR8_T
test<char8_t>();
#endif
#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
test<char16_t>();
test<char32_t>();
#endif
return true;
}
// Runs the tests both at run-time and during constant evaluation.
int main(int, char**) {
test();
static_assert(test());
return 0;
}

View File

@ -0,0 +1,270 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-no-concepts
// UNSUPPORTED: libcpp-has-no-incomplete-format
// UTF-32 doesn't work properly
// XFAIL: windows
// <format>
// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.
// This version runs the test when the platform has Unicode support.
// UNSUPPORTED: libcpp-has-no-unicode
#include <format>
#include <cassert>
#include "test_macros.h"
#include "make_string.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
using namespace std::__format_spec;
// Member-wise equality of two __string_alignment objects; only needed to
// compare the expected and the actual result in this test.
template <class CharT>
constexpr bool operator==(const __string_alignment<CharT>& lhs,
                          const __string_alignment<CharT>& rhs) noexcept {
  if (lhs.__last != rhs.__last)
    return false;
  if (lhs.__size != rhs.__size)
    return false;
  return lhs.__align == rhs.__align;
}
// Calls __get_string_alignment for str and validates the result.
//
// offset:    the expected __last, as an offset from the start of str.
// size:      the expected __size.
// align:     the expected __align.
// width:     the minimum width passed to __get_string_alignment.
// precision: the maximum width passed to __get_string_alignment; -1 means
//            there's no maximum.
//
// width and precision are signed, like the parameters of
// __get_string_alignment. The callers pass -1 as precision, which shouldn't
// make a round-trip through an unsigned type.
template <class CharT>
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
                                    const CharT* str, ptrdiff_t width,
                                    ptrdiff_t precision) {
  std::basic_string_view<CharT> sv{str};
  __string_alignment<CharT> expected{sv.begin() + offset, size, align};
  __string_alignment<CharT> traits =
      __get_string_alignment(sv.begin(), sv.end(), width, precision);
  assert(traits == expected);
}
// Runs the fast scanner over str and validates that it stops `expected` code
// units after the start of the input.
template <class CharT>
constexpr void estimate_column_width_fast(size_t expected, const CharT* str) {
  std::basic_string_view<CharT> input{str};
  const auto first = input.begin();
  const CharT* stop = __detail::__estimate_column_width_fast(first, input.end());
  assert(stop == first + expected);
}
// Tests where the fast scanner stops. The expected value is the number of code
// units accepted before the scanner gives up; 3 means the entire three code
// unit input has been accepted.
template <class CharT>
constexpr void estimate_column_width_fast() {
// No unicode
estimate_column_width_fast(3, CSTR("abc"));
estimate_column_width_fast(3, CSTR("a\u007fc"));
if constexpr (sizeof(CharT) == 1) {
// UTF-8 stop at the first multi-byte character.
estimate_column_width_fast(0, CSTR("\u0080bc"));
estimate_column_width_fast(1, CSTR("a\u0080c"));
estimate_column_width_fast(2, CSTR("ab\u0080"));
estimate_column_width_fast(1, CSTR("aßc"));
estimate_column_width_fast(1, CSTR("a\u07ffc"));
estimate_column_width_fast(1, CSTR("a\u0800c"));
estimate_column_width_fast(1, CSTR("a\u10ffc"));
} else {
// UTF-16/32 stop at the first multi-column character.
// All code points below are less than 0x1100, so nothing stops the scanner.
estimate_column_width_fast(3, CSTR("\u0080bc"));
estimate_column_width_fast(3, CSTR("a\u0080c"));
estimate_column_width_fast(3, CSTR("ab\u0080"));
estimate_column_width_fast(3, CSTR("aßc"));
estimate_column_width_fast(3, CSTR("a\u07ffc"));
estimate_column_width_fast(3, CSTR("a\u0800c"));
estimate_column_width_fast(3, CSTR("a\u10ffc"));
}
// First 2-column character
// From 0x1100 on the scanner stops for all encodings, whether or not the code
// point really needs two columns.
estimate_column_width_fast(1, CSTR("a\u1100c"));
estimate_column_width_fast(1, CSTR("a\U0000ffffc"));
estimate_column_width_fast(1, CSTR("a\U00010000c"));
estimate_column_width_fast(1, CSTR("a\U0010FFFFc"));
}
// Runs __detail::__estimate_column_width over the NUL-terminated string `str`
// and asserts that the reported __width equals `expected`.
//
// Only the __width member of the result is checked.
template <class CharT>
constexpr void estimate_column_width(size_t expected, const CharT* str) {
  const std::basic_string_view<CharT> input{str};
  // NOTE(review): the trailing -1 is presumably a "no maximum" limit - confirm
  // against the declaration of __estimate_column_width.
  const __detail::__column_width_result<CharT> result =
      __detail::__estimate_column_width(input.begin(), input.end(), -1);
  assert(result.__width == expected);
}
// Test table for __estimate_column_width for one character type.
//
// Every call passes a string holding a single code point together with the
// column width (1 or 2) that is expected to be reported for it. The
// "N-byte code points" headings group the code points by the length of their
// UTF-8 encoding; the same code points are used for every CharT.
//
// NOTE(review): trailing comments that start with "U+XXXX" name a code point
// adjacent to the one in the literal, not the literal itself - presumably
// because the tested range boundary has no character name of its own.
// Confirm against the Unicode character database.
template <class CharT>
constexpr void estimate_column_width() {
//*** 1-byte code points ***
estimate_column_width(1, CSTR(" "));
estimate_column_width(1, CSTR("~"));
//*** 2-byte code points ***
estimate_column_width(1, CSTR("\u00a1")); // INVERTED EXCLAMATION MARK
estimate_column_width(1, CSTR("\u07ff")); // NKO TAMAN SIGN
//*** 3-byte code points ***
estimate_column_width(1, CSTR("\u0800")); // SAMARITAN LETTER ALAF
estimate_column_width(1, CSTR("\ufffd")); // REPLACEMENT CHARACTER
// 2 column ranges
estimate_column_width(2, CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK
estimate_column_width(2, CSTR("\u115f")); // HANGUL CHOSEONG FILLER
estimate_column_width(2, CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET
estimate_column_width(2, CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET
estimate_column_width(2, CSTR("\u2e80")); // CJK RADICAL REPEAT
estimate_column_width(2, CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
estimate_column_width(2, CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
estimate_column_width(2, CSTR("\ua4cf")); // U+A4D0 LISU LETTER BA
estimate_column_width(2, CSTR("\uac00")); // <Hangul Syllable, First>
estimate_column_width(2, CSTR("\ud7a3")); // Hangul Syllable Hih
estimate_column_width(2, CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
estimate_column_width(2, CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
estimate_column_width(2,
CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
estimate_column_width(
2, CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
estimate_column_width(
2, CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
estimate_column_width(2,
CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
estimate_column_width(2, CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
estimate_column_width(2, CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
estimate_column_width(2, CSTR("\uffe0")); // FULLWIDTH CENT SIGN
estimate_column_width(2, CSTR("\uffe6")); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
estimate_column_width(1, CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A
estimate_column_width(1, CSTR("\U0010FFFF")); // Undefined Character
// 2 column ranges
estimate_column_width(2, CSTR("\U0001f300")); // CYCLONE
estimate_column_width(2, CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS
estimate_column_width(
2, CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
estimate_column_width(2, CSTR("\U0001f9ff")); // NAZAR AMULET
estimate_column_width(
2, CSTR("\U00020000")); // <CJK Ideograph Extension B, First>
estimate_column_width(2, CSTR("\U0002fffd")); // Undefined Character
estimate_column_width(
2, CSTR("\U00030000")); // <CJK Ideograph Extension G, First>
estimate_column_width(2, CSTR("\U0003fffd")); // Undefined Character
}
// Test table for __get_string_alignment for one character type.
//
// Each call reads as
//   (expected offset of __last, expected __size, expected __align,
//    input, width, precision)
// where a precision of -1 is converted to the largest size_t value.
template <class CharT>
constexpr void get_string_alignment() {
  // Extra code units added to the expected offset when CharT is one byte wide.
  // Used by the cases whose input holds one of the U+11xx characters.
  constexpr size_t utf8_extra = sizeof(CharT) == 1 ? 2 : 0;

  // Truncate the input.
  get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);

  // The 2-column character gets entirely rejected.
  get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2);
  // Due to the requested width extra alignment is required.
  get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2);

  // Same but for a 2-column 4-byte UTF-8 sequence
  get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2);
  get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2);

  // No alignment required.
  get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
  get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);

  // Special case, we have a special character already parsed and have enough
  // width to satisfy the minimum required width.
  get_string_alignment(3 + utf8_extra, 0, false, CSTR("ab\u1111"), 2, -1);

  // The inputs below contain U+115F, a 2-column character.
  // Evaluates all so size ->4
  get_string_alignment(3 + utf8_extra, 4, false, CSTR("a\u115fc"), 3, -1);
  // Evaluates all so size ->4
  get_string_alignment(3 + utf8_extra, 4, false, CSTR("a\u115fc"), 4, -1);
  // Evaluates all so size ->5
  get_string_alignment(4 + utf8_extra, 5, false, CSTR("a\u115fcd"), 4, -1);
  // Evaluates all so size ->5
  get_string_alignment(4 + utf8_extra, 5, false, CSTR("a\u115fcd"), 5, -1);

  // Extend width
  get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
  // U+1160 is a 1-column character.
  get_string_alignment(3 + utf8_extra, 3, true, CSTR("a\u1160c"), 4, -1);

  // In this case the threshold where the width is still determined.
  get_string_alignment(2 + utf8_extra, 3, false, CSTR("i\u1110"), 2, -1);
  // The width is no longer exactly determined.
  get_string_alignment(2 + utf8_extra, 0, false, CSTR("i\u1110"), 1, -1);

  // Extend width and truncate input.
  get_string_alignment(1, 1, true, CSTR("abc"), 3, 1);

  if constexpr (sizeof(CharT) == 1) {
    // Corrupt UTF-8 sequence.
    get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3);
    get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3);
    get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3);
  } else if constexpr (sizeof(CharT) == 2) {
    if constexpr (std::same_as<CharT, char16_t>) {
      // Corrupt UTF-16 sequence.
      get_string_alignment(2, 2, false, u"a\xdddd", 0, 3);
    } else {
      // Corrupt UTF-16 wchar_t sequence.
      get_string_alignment(2, 2, false, L"a\xdddd", 0, 3);
    }
  }
  // UTF-32 doesn't combine characters, thus no corruption tests.
}
// Runs the three test tables (fast column-width estimation, full column-width
// estimation, string alignment) for a single character type, in that order.
template <class CharT>
constexpr void test() {
estimate_column_width_fast<CharT>();
estimate_column_width<CharT>();
get_string_alignment<CharT>();
}
// Runs the per-character-type tests for every character type that is
// available. Always returns true so the call can be used as the operand of a
// static_assert; failures surface through the asserts in the helpers.
constexpr bool test() {
test<char>();
test<wchar_t>();
// char8_t is skipped when _LIBCPP_HAS_NO_CHAR8_T is defined.
#ifndef _LIBCPP_HAS_NO_CHAR8_T
test<char8_t>();
#endif
// char16_t and char32_t are skipped when _LIBCPP_HAS_NO_UNICODE_CHARS is
// defined.
#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
test<char16_t>();
test<char32_t>();
#endif
return true;
}
// Entry point: the same test body is checked twice, once during constant
// evaluation and once at run time.
int main(int, char**) {
  // Compile-time run; any failing assert makes test() a non-constant
  // expression and breaks the build.
  static_assert(test());

  // Run-time run.
  test();

  return 0;
}