forked from OSchip/llvm-project
[libc++][format] Implement Unicode support.
This adds the width estimation functions to the std-format-spec. Implements parts of: - P0645 Text Formatting - P1868 width: clarifying units of width and precision in std::format Reviewed By: #libc, ldionne, vitaut Differential Revision: https://reviews.llvm.org/D103413
This commit is contained in:
parent
f33274c7bf
commit
ac7031b2b2
|
@ -0,0 +1,196 @@
|
|||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
#include <array>
|
||||
#include <format>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "test_macros.h"
|
||||
|
||||
template <class CharT, size_t N>
|
||||
class tester {
|
||||
static constexpr size_t size_ = N - 1;
|
||||
std::array<CharT, 100 * size_> data_;
|
||||
|
||||
public:
|
||||
explicit constexpr tester(const CharT (&input)[N]) {
|
||||
auto it = data_.begin();
|
||||
for (int i = 0; i < 100; ++i)
|
||||
it = std::copy_n(input, size_, it);
|
||||
}
|
||||
|
||||
constexpr size_t size() const noexcept { return data_.size(); }
|
||||
constexpr const CharT* begin() const noexcept { return data_.begin(); }
|
||||
constexpr const CharT* end() const noexcept { return data_.end(); }
|
||||
|
||||
void test(benchmark::State& state) const {
|
||||
for (auto _ : state)
|
||||
benchmark::DoNotOptimize(std::__format_spec::__get_string_alignment(
|
||||
begin(), end(), 1'000'000, 1'000'000));
|
||||
state.SetItemsProcessed(state.iterations() * size());
|
||||
}
|
||||
};
|
||||
|
||||
// Builds a tester from the string literal `str`, spelled with the encoding
// prefix that belongs to CharT (no prefix for char, u for char16_t, U for any
// other type), and runs it. `CharT` and `state` must be visible where the
// macro is expanded.
#define TEST(str)                                                              \
  if constexpr (std::same_as<CharT, char>) {                                   \
    constexpr auto fixture = tester{str};                                      \
    fixture.test(state);                                                       \
  } else if constexpr (std::same_as<CharT, char16_t>) {                        \
    constexpr auto fixture = tester{TEST_CONCAT(u, str)};                      \
    fixture.test(state);                                                       \
  } else {                                                                     \
    constexpr auto fixture = tester{TEST_CONCAT(U, str)};                      \
    fixture.test(state);                                                       \
  }
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthNoMultiByte(benchmark::State& state) {
|
||||
TEST("The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthTwoByteDE(benchmark::State& state) {
|
||||
static_assert(sizeof("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == 67);
|
||||
|
||||
// https://en.wikipedia.org/wiki/Pangram
|
||||
TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthTwoBytePL(benchmark::State& state) {
|
||||
static_assert(sizeof("Stróż pchnął kość w quiz gędźb vel fax myjń") == 53);
|
||||
|
||||
// https://en.wikipedia.org/wiki/Pangram
|
||||
TEST("Stróż pchnął kość w quiz gędźb vel fax myjń");
|
||||
}
|
||||
|
||||
// All values are below 0x1100, which is the first multi column code point.
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) {
|
||||
static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
|
||||
49);
|
||||
|
||||
TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void
|
||||
BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
|
||||
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") ==
|
||||
49);
|
||||
|
||||
TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
|
||||
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
|
||||
49);
|
||||
|
||||
TEST("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByte(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
|
||||
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") ==
|
||||
49);
|
||||
|
||||
TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
|
||||
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00010004\U00010005\U00010006\U00010007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0001000c\U0001000d\U0001000e\U0001000f") == 65);
|
||||
|
||||
TEST("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00010004\U00010005\U00010006\U00010007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0001000c\U0001000d\U0001000e\U0001000f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00020000\U00020002\U00020002\U00020003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00020008\U00020009\U0002000a\U0002000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
|
||||
|
||||
TEST("\U00020000\U00020002\U00020002\U00020003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00020008\U00020009\U0002000a\U0002000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByte(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
|
||||
|
||||
TEST("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f");
|
||||
}
|
||||
|
||||
// Registers all width estimation benchmarks for one code unit type. The
// benchmarks are listed in the same order for every type, and the types are
// registered in the order char, char16_t, char32_t.
#define REGISTER_ESTIMATE_LENGTH_BENCHMARKS(CharT)                             \
  BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, CharT);                     \
  BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, CharT);                       \
  BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, CharT);                       \
  BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, CharT);        \
  BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, CharT);       \
  BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, CharT);           \
  BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, CharT);                       \
  BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, CharT);            \
  BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, CharT);            \
  BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, CharT)

REGISTER_ESTIMATE_LENGTH_BENCHMARKS(char);
REGISTER_ESTIMATE_LENGTH_BENCHMARKS(char16_t);
REGISTER_ESTIMATE_LENGTH_BENCHMARKS(char32_t);
|
||||
|
||||
// Entry point: hands the command line to the benchmark library and runs the
// selected benchmarks, or exits with status 1 when
// ReportUnrecognizedArguments reports a problem.
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);

  const bool unrecognized = benchmark::ReportUnrecognizedArguments(argc, argv);
  if (!unrecognized) {
    benchmark::RunSpecifiedBenchmarks();
    return 0;
  }

  return 1;
}
|
||||
#else
|
||||
// Fallback used when _LIBCPP_HAS_NO_UNICODE is defined: no benchmarks are
// built, so the program only reports success.
int main(int, char**) {
  return 0;
}
|
||||
#endif
|
|
@ -171,7 +171,7 @@
|
|||
"`P1460 <https://wg21.link/P1460>`__","LWG","Mandating the Standard Library: Clause 20 - Utilities library","Prague","* *",""
|
||||
"`P1739 <https://wg21.link/P1739>`__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","* *",""
|
||||
"`P1831 <https://wg21.link/P1831>`__","LWG","Deprecating volatile: library","Prague","* *",""
|
||||
"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","* *",""
|
||||
"`P1868 <https://wg21.link/P1868>`__","LWG","width: clarifying units of width and precision in std::format","Prague","|In Progress|",""
|
||||
"`P1908 <https://wg21.link/P1908>`__","CWG","Reserving Attribute Namespaces for Future Use","Prague","* *",""
|
||||
"`P1937 <https://wg21.link/P1937>`__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *",""
|
||||
"`P1956 <https://wg21.link/P1956>`__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0"
|
||||
|
|
|
|
@ -10,12 +10,15 @@
|
|||
#ifndef _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
|
||||
#define _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
|
||||
|
||||
#include <__algorithm/find_if.h>
|
||||
#include <__algorithm/min.h>
|
||||
#include <__config>
|
||||
#include <__debug>
|
||||
#include <__format/format_arg.h>
|
||||
#include <__format/format_error.h>
|
||||
#include <__format/format_string.h>
|
||||
#include <__variant/monostate.h>
|
||||
#include <bit>
|
||||
#include <concepts>
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
@ -24,6 +27,9 @@
|
|||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_PUSH_MACROS
|
||||
#include <__undef_macros>
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
#if _LIBCPP_STD_VER > 17
|
||||
|
@ -711,6 +717,462 @@ protected:
|
|||
// TODO FMT Add a parser for floating-point values.
|
||||
// TODO FMT Add a parser for pointer values.
|
||||
|
||||
/** Helper struct returned from @ref __get_string_alignment. */
|
||||
template <class _CharT>
|
||||
struct _LIBCPP_TEMPLATE_VIS __string_alignment {
|
||||
/** Points beyond the last character to write to the output. */
|
||||
const _CharT* __last;
|
||||
/**
|
||||
* The estimated number of columns in the output or 0.
|
||||
*
|
||||
* Only when the output needs to be aligned it's required to know the exact
|
||||
* number of columns in the output. So if the formatted output has only a
|
||||
* minimum width the exact size isn't important. It's only important to know
|
||||
* the minimum has been reached. The minimum width is the width specified in
|
||||
* the format-spec.
|
||||
*
|
||||
* For example in this code @code std::format("{:10}", MyString); @endcode
|
||||
* the width estimation can stop once the algorithm has determined the output
|
||||
* width is 10 columns.
|
||||
*
|
||||
* So if:
|
||||
* * @ref __align == @c true the @ref __size is the estimated number of
|
||||
* columns required.
|
||||
* * @ref __align == @c false the @ref __size is the estimated number of
|
||||
* columns required or 0 when the estimation algorithm stopped prematurely.
|
||||
*/
|
||||
ptrdiff_t __size;
|
||||
/**
|
||||
* Does the output need to be aligned.
|
||||
*
|
||||
* When alignment is needed the output algorithm needs to add the proper
|
||||
* padding. Else the output algorithm just needs to copy the input up to
|
||||
* @ref __last.
|
||||
*/
|
||||
bool __align;
|
||||
};
|
||||
|
||||
#ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
namespace __detail {
|
||||
|
||||
/**
|
||||
* Unicode column width estimates.
|
||||
*
|
||||
* Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
|
||||
* Depending on format the relation between the number of code units stored and
|
||||
* the number of output columns differs. The first relation is the number of
|
||||
* code units forming a code point. (The text assumes the code units are
|
||||
* unsigned.)
|
||||
* - UTF-8: The number of code units is between one and four. The first 128
|
||||
* Unicode code points match the ASCII character set. When the highest bit is
|
||||
* set it means the code point has more than one code unit.
|
||||
* - UTF-16: The number of code units is between 1 and 2. When the first
|
||||
* code unit is in the range [0xd800,0xdfff] it means the code point uses two
|
||||
* code units.
|
||||
* - UTF-32: The number of code units is always one.
|
||||
*
|
||||
* The code point to the number of columns isn't well defined. The code uses the
|
||||
* estimations defined in [format.string.std]/11. This list might change in the
|
||||
* future.
|
||||
*
|
||||
* The algorithm of @ref __get_string_alignment uses two different scanners:
|
||||
* - The simple scanner @ref __estimate_column_width_fast. This scanner assumes
|
||||
* 1 code unit is 1 column. This scanner stops when it can't be sure the
|
||||
* assumption is valid:
|
||||
* - UTF-8 when the code point is encoded in more than 1 code unit.
|
||||
* - UTF-16 and UTF-32 when the first multi-column code point is encountered.
|
||||
* (The code unit's value is lower than 0xd800 so the 2 code unit encoding
|
||||
* is irrelevant for this scanner.)
|
||||
* Due to these assumptions the scanner is faster than the full scanner. It
|
||||
* can process all text only containing ASCII. For UTF-16/32 it can process
|
||||
* most (all?) European languages. (Note the set it can process might be
|
||||
* reduced in the future, due to updates in the scanning rules.)
|
||||
* - The full scanner @ref __estimate_column_width. This scanner, if needed,
|
||||
* converts multiple code units into one code point then converts the code
|
||||
* point to a column width.
|
||||
*
|
||||
* See also:
|
||||
* - [format.string.std]/11
|
||||
* - https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
* - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
*/
|
||||
|
||||
/**
 * The lowest code point that is estimated to be two columns wide.
 *
 * The fast UTF-16/32 scanner has to stop processing when it reaches a code
 * unit with this value or higher.
 */
inline constexpr uint32_t __two_column_code_point = 0x1100;
|
||||
|
||||
/** Helper concept for an UTF-8 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>;
|
||||
|
||||
/** Helper concept for an UTF-16 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>;
|
||||
|
||||
/** Helper concept for an UTF-32 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>;
|
||||
|
||||
/** Helper concept for an UTF-16 or UTF-32 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>;
|
||||
|
||||
/**
|
||||
* Converts a code point to the column width.
|
||||
*
|
||||
* The estimations are conforming to [format.string.general]/11
|
||||
*
|
||||
* This version expects a value less than 0x1'0000, which is a 3-byte UTF-8
|
||||
* character.
|
||||
*/
|
||||
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept {
|
||||
_LIBCPP_ASSERT(__c < 0x1'0000,
|
||||
"Use __column_width_4 or __column_width for larger values");
|
||||
|
||||
// clang-format off
|
||||
return 1 + (__c >= 0x1100 && (__c <= 0x115f ||
|
||||
(__c >= 0x2329 && (__c <= 0x232a ||
|
||||
(__c >= 0x2e80 && (__c <= 0x303e ||
|
||||
(__c >= 0x3040 && (__c <= 0xa4cf ||
|
||||
(__c >= 0xac00 && (__c <= 0xd7a3 ||
|
||||
(__c >= 0xf900 && (__c <= 0xfaff ||
|
||||
(__c >= 0xfe10 && (__c <= 0xfe19 ||
|
||||
(__c >= 0xfe30 && (__c <= 0xfe6f ||
|
||||
(__c >= 0xff00 && (__c <= 0xff60 ||
|
||||
(__c >= 0xffe0 && (__c <= 0xffe6
|
||||
))))))))))))))))))));
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
/**
|
||||
* @overload
|
||||
*
|
||||
* This version expects a value greater than or equal to 0x1'0000, which is a
|
||||
* 4-byte UTF-8 character.
|
||||
*/
|
||||
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept {
|
||||
_LIBCPP_ASSERT(__c >= 0x1'0000,
|
||||
"Use __column_width_3 or __column_width for smaller values");
|
||||
|
||||
// clang-format off
|
||||
return 1 + (__c >= 0x1'f300 && (__c <= 0x1'f64f ||
|
||||
(__c >= 0x1'f900 && (__c <= 0x1'f9ff ||
|
||||
(__c >= 0x2'0000 && (__c <= 0x2'fffd ||
|
||||
(__c >= 0x3'0000 && (__c <= 0x3'fffd
|
||||
))))))));
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
/**
|
||||
* @overload
|
||||
*
|
||||
* The general case, accepting all values.
|
||||
*/
|
||||
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept {
|
||||
if (__c < 0x1'0000)
|
||||
return __column_width_3(__c);
|
||||
|
||||
return __column_width_4(__c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate the column width for the UTF-8 sequence using the fast algorithm.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
|
||||
__estimate_column_width_fast(const _CharT* __first,
|
||||
const _CharT* __last) noexcept {
|
||||
return _VSTD::find_if(__first, __last,
|
||||
[](unsigned char __c) { return __c & 0x80; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @overload
|
||||
*
|
||||
* The implementation for UTF-16/32.
|
||||
*/
|
||||
template <__utf16_or_32_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
|
||||
__estimate_column_width_fast(const _CharT* __first,
|
||||
const _CharT* __last) noexcept {
|
||||
return _VSTD::find_if(__first, __last,
|
||||
[](uint32_t __c) { return __c >= 0x1100; });
|
||||
}
|
||||
|
||||
template <class _CharT>
|
||||
struct _LIBCPP_TEMPLATE_VIS __column_width_result {
|
||||
/** The number of output columns. */
|
||||
size_t __width;
|
||||
/**
|
||||
* The last parsed element.
|
||||
*
|
||||
* This limits the original output to fit in the wanted number of columns.
|
||||
*/
|
||||
const _CharT* __ptr;
|
||||
};
|
||||
|
||||
/**
|
||||
* Small helper to determine the width of malformed Unicode.
|
||||
*
|
||||
* @note This function's only needed for UTF-8. During scanning UTF-8 there
|
||||
* are multiple place where it can be detected that the Unicode is malformed.
|
||||
* UTF-16 only requires 1 test and UTF-32 requires no testing.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum, size_t __result) noexcept {
|
||||
size_t __size = __last - __first;
|
||||
size_t __n = _VSTD::min(__size, __maximum);
|
||||
return {__result + __n, __first + __n};
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines the number of output columns needed to render the input.
|
||||
*
|
||||
* @note When the scanner encounters malformed Unicode it acts as-if every code
|
||||
* unit at the end of the input is one output column. It's expected the output
|
||||
* terminal will replace these malformed code units with a one column
|
||||
* replacement characters.
|
||||
*
|
||||
* @param __first Points to the first element of the input range.
|
||||
* @param __last Points beyond the last element of the input range.
|
||||
* @param __maximum The maximum number of output columns. The returned number
|
||||
* of estimated output columns will not exceed this value.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
// Based on the number of leading 1 bits the number of code units in the
|
||||
// code point can be determined. See
|
||||
// https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) {
|
||||
case 0: // 1-code unit encoding: all 1 column
|
||||
++__result;
|
||||
++__first;
|
||||
break;
|
||||
|
||||
case 2: // 2-code unit encoding: all 1 column
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 2) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
__first += 2;
|
||||
++__result;
|
||||
break;
|
||||
|
||||
case 3: // 3-code unit encoding: either 1 or 2 columns
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 3) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
{
|
||||
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__result += __column_width_3(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first - 3};
|
||||
}
|
||||
break;
|
||||
case 4: // 4-code unit encoding: either 1 or 2 columns
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 4) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
{
|
||||
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__result += __column_width_4(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first - 4};
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Malformed Unicode.
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
}
|
||||
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
template <__utf16_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
uint32_t __c = *__first;
|
||||
// Is the code unit part of a surrogate pair? See
|
||||
// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
if (__c >= 0xd800 && __c <= 0xDfff) {
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 2) [[unlikely]]
|
||||
return {__result + 1, __first + 1};
|
||||
|
||||
__c -= 0xd800;
|
||||
__c <<= 10;
|
||||
__c += (*(__first + 1) - 0xdc00);
|
||||
__c += 0x10'000;
|
||||
|
||||
__result += __column_width_4(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
__first += 2;
|
||||
} else {
|
||||
__result += __column_width_3(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
++__first;
|
||||
}
|
||||
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
template <__utf32_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
wchar_t __c = *__first;
|
||||
__result += __column_width(__c);
|
||||
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
|
||||
++__first;
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
} // namespace __detail
|
||||
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
|
||||
__get_string_alignment(const _CharT* __first, const _CharT* __last,
|
||||
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
|
||||
_LIBCPP_ASSERT(__width != 0 || __precision != -1,
|
||||
"The function has no effect and shouldn't be used");
|
||||
|
||||
// TODO FMT There might be more optimizations possible:
|
||||
// If __precision == __format::__number_max and the encoding is:
|
||||
// * UTF-8 : 4 * (__last - __first) >= __width
|
||||
// * UTF-16 : 2 * (__last - __first) >= __width
|
||||
// * UTF-32 : (__last - __first) >= __width
|
||||
// In these cases it's certain the output is at least the requested width.
|
||||
// It's unknown how often this happens in practice. For now the improvement
|
||||
// isn't implemented.
|
||||
|
||||
/*
|
||||
* First assume there are no special Unicode code units in the input.
|
||||
* - Apply the precision (this may reduce the size of the input). When
|
||||
* __precison == -1 this step is omitted.
|
||||
* - Scan for special code units in the input.
|
||||
* If our assumption was correct the __pos will be at the end of the input.
|
||||
*/
|
||||
const ptrdiff_t __length = __last - __first;
|
||||
const _CharT* __limit =
|
||||
__first +
|
||||
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
|
||||
ptrdiff_t __size = __limit - __first;
|
||||
const _CharT* __pos =
|
||||
__detail::__estimate_column_width_fast(__first, __limit);
|
||||
|
||||
if (__pos == __limit)
|
||||
return {__limit, __size, __size < __width};
|
||||
|
||||
/*
|
||||
* Our assumption was wrong, there are special Unicode code units.
|
||||
* The range [__first, __pos) contains a set of code units with the
|
||||
* following property:
|
||||
* Every _CharT in the range will be rendered in 1 column.
|
||||
*
|
||||
* If there's no maximum width and the parsed size already exceeds the
|
||||
* minimum required width. The real size isn't important. So bail out.
|
||||
*/
|
||||
if (__precision == -1 && (__pos - __first) >= __width)
|
||||
return {__last, 0, false};
|
||||
|
||||
/* If there's a __precision, truncate the output to that width. */
|
||||
ptrdiff_t __prefix = __pos - __first;
|
||||
if (__precision != -1) {
|
||||
_LIBCPP_ASSERT(__precision > __prefix, "Logic error.");
|
||||
auto __lengh_info = __detail::__estimate_column_width(
|
||||
__pos, __last, __precision - __prefix);
|
||||
__size = __lengh_info.__width + __prefix;
|
||||
return {__lengh_info.__ptr, __size, __size < __width};
|
||||
}
|
||||
|
||||
/* Else use __width to determine the number of required padding characters. */
|
||||
_LIBCPP_ASSERT(__width > __prefix, "Logic error.");
|
||||
/*
|
||||
* The column width is always one or two columns. For the precision the wanted
|
||||
* column width is the maximum, for the width it's the minimum. Using the
|
||||
* width estimation with its truncating behavior will result in the wrong
|
||||
* result in the following case:
|
||||
* - The last code unit processed requires two columns and exceeds the
|
||||
* maximum column width.
|
||||
* By increasing the __maximum by one avoids this issue. (It means it may
|
||||
* pass one code point more than required to determine the proper result;
|
||||
* that however isn't a problem for the algorithm.)
|
||||
*/
|
||||
size_t __maximum = 1 + __width - __prefix;
|
||||
auto __lengh_info =
|
||||
__detail::__estimate_column_width(__pos, __last, __maximum);
|
||||
if (__lengh_info.__ptr != __last) {
|
||||
// Consumed the width number of code units. The exact size of the string
|
||||
// is unknown. We only know we don't need to align the output.
|
||||
_LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >=
|
||||
__width,
|
||||
"Logic error");
|
||||
return {__last, 0, false};
|
||||
}
|
||||
|
||||
__size = __lengh_info.__width + __prefix;
|
||||
return {__last, __size, __size < __width};
|
||||
}
|
||||
#else // _LIBCPP_HAS_NO_UNICODE
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
|
||||
__get_string_alignment(const _CharT* __first, const _CharT* __last,
|
||||
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
|
||||
const ptrdiff_t __length = __last - __first;
|
||||
const _CharT* __limit =
|
||||
__first +
|
||||
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
|
||||
ptrdiff_t __size = __limit - __first;
|
||||
return {__limit, __size, __size < __width};
|
||||
}
|
||||
#endif // _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
} // namespace __format_spec
|
||||
|
||||
# endif // !defined(_LIBCPP_HAS_NO_CONCEPTS)
|
||||
|
@ -719,4 +1181,6 @@ protected:
|
|||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
_LIBCPP_POP_MACROS
|
||||
|
||||
#endif // _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-no-concepts
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// UTF-32 doesn't work properly
|
||||
// XFAIL: windows
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the Unicode width support of the standard format specifiers.
|
||||
// It tests [format.string.std]/8 - 11:
|
||||
// - Properly determining the estimated width of a Unicode string.
|
||||
// - Properly truncating to the wanted maximum width.
|
||||
|
||||
// This version runs the test when the platform doesn't have Unicode support.
|
||||
// REQUIRES: libcpp-has-no-unicode
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
|
||||
#include "test_macros.h"
|
||||
#include "make_string.h"
|
||||
|
||||
#define CSTR(S) MAKE_CSTRING(CharT, S)
|
||||
|
||||
using namespace std::__format_spec;
|
||||
|
||||
template <class CharT>
|
||||
constexpr bool operator==(const __string_alignment<CharT>& lhs,
|
||||
const __string_alignment<CharT>& rhs) noexcept {
|
||||
return lhs.__last == rhs.__last && lhs.__size == rhs.__size &&
|
||||
lhs.__align == rhs.__align;
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
|
||||
const CharT* str, size_t width,
|
||||
size_t precision) {
|
||||
std::basic_string_view<CharT> sv{str};
|
||||
__string_alignment<CharT> expected{sv.begin() + offset, size, align};
|
||||
__string_alignment<CharT> traits =
|
||||
__get_string_alignment(sv.begin(), sv.end(), width, precision);
|
||||
assert(traits == expected);
|
||||
}
|
||||
|
||||
// Exercises __get_string_alignment for one character type on a platform
// without Unicode support. The expectations below count code units, not
// display columns.
template <class CharT>
constexpr void get_string_alignment() {
  // Expected offset and size for the inputs that mix two ASCII characters
  // with one \uXXXX character: 3 for the wide types and 5 when CharT is one
  // byte wide.
  constexpr int units = 3 + 2 * (sizeof(CharT) == 1);

  // Truncate the input.
  get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);

  // The 2-column character gets half accepted.
  get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2);

  // No alignment since the number of characters fits.
  get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2);

  // Same but for a 2-column 4-byte UTF-8 sequence
  get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2);
  get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2);

  // No alignment required.
  get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
  get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);

  get_string_alignment(units, units, false, CSTR("ab\u1111"), 2, -1);

  // The whole input is accepted; the expected size equals the expected
  // offset. The input contains a 2-column character.
  get_string_alignment(units, units, false, CSTR("a\u115fc"), 3, -1);

  // Extend width
  get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
  // The input contains a 1-column character.
  get_string_alignment(units, units, true, CSTR("a\u1160c"), 6, -1);
}
|
||||
|
||||
// Runs all checks of this file for the character type CharT.
template <class CharT>
constexpr void test() {
  get_string_alignment<CharT>();
}
|
||||
|
||||
// Runs the checks for every character type the configuration provides.
// Always returns true so the call can be used inside a static_assert.
constexpr bool test() {
  test<char>();
  test<wchar_t>();
#if !defined(_LIBCPP_HAS_NO_CHAR8_T)
  test<char8_t>();
#endif
#if !defined(_LIBCPP_HAS_NO_UNICODE_CHARS)
  test<char16_t>();
  test<char32_t>();
#endif
  return true;
}
|
||||
|
||||
int main(int, char**) {
  // Evaluate the checks during constant evaluation and at run time.
  static_assert(test());
  test();

  return 0;
}
|
|
@ -0,0 +1,270 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-no-concepts
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// UTF-32 doesn't work properly
|
||||
// XFAIL: windows
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the Unicode width support of the standard format specifiers.
|
||||
// It tests [format.string.std]/8 - 11:
|
||||
// - Properly determining the estimated width of a Unicode string.
|
||||
// - Properly truncating to the wanted maximum width.
|
||||
|
||||
// This version runs the test when the platform has Unicode support.
|
||||
// UNSUPPORTED: libcpp-has-no-unicode
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
|
||||
#include "test_macros.h"
|
||||
#include "make_string.h"
|
||||
|
||||
#define CSTR(S) MAKE_CSTRING(CharT, S)
|
||||
|
||||
using namespace std::__format_spec;
|
||||
|
||||
// Member-wise equality for __string_alignment so that results can be compared
// against an expected value inside assert().
template <class CharT>
constexpr bool operator==(const __string_alignment<CharT>& lhs,
                          const __string_alignment<CharT>& rhs) noexcept {
  if (lhs.__last != rhs.__last)
    return false;
  if (lhs.__size != rhs.__size)
    return false;
  return lhs.__align == rhs.__align;
}
|
||||
|
||||
// Calls __get_string_alignment on the NUL-terminated string `str` with the
// given `width` and `precision` and asserts that the result equals
// {begin + offset, size, align}.
template <class CharT>
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
                                    const CharT* str, size_t width,
                                    size_t precision) {
  const std::basic_string_view<CharT> view{str};
  const auto first = view.begin();
  const auto last = view.end();

  const __string_alignment<CharT> expected{first + offset, size, align};
  const __string_alignment<CharT> actual =
      __get_string_alignment(first, last, width, precision);
  assert(actual == expected);
}
|
||||
|
||||
// Asserts that __estimate_column_width_fast, applied to the NUL-terminated
// string `str`, returns a pointer `expected` code units past its beginning.
template <class CharT>
constexpr void estimate_column_width_fast(size_t expected, const CharT* str) {
  const std::basic_string_view<CharT> view{str};
  const auto first = view.begin();

  const CharT* const stop =
      __detail::__estimate_column_width_fast(first, view.end());
  assert(stop == first + expected);
}
|
||||
|
||||
// Checks where __estimate_column_width_fast stops for one character type.
template <class CharT>
constexpr void estimate_column_width_fast() {
  // For one-byte character types the expected stop is the first multi-byte
  // character; for the wider types the whole three code unit input is
  // expected to be accepted unless it holds a multi-column character.
  constexpr bool one_byte = sizeof(CharT) == 1;
  constexpr size_t whole = 3;

  // No unicode
  estimate_column_width_fast(3, CSTR("abc"));
  estimate_column_width_fast(3, CSTR("a\u007fc"));

  // UTF-8 stops at the first multi-byte character, UTF-16/32 stop at the
  // first multi-column character.
  estimate_column_width_fast(one_byte ? 0 : whole, CSTR("\u0080bc"));
  estimate_column_width_fast(one_byte ? 1 : whole, CSTR("a\u0080c"));
  estimate_column_width_fast(one_byte ? 2 : whole, CSTR("ab\u0080"));
  estimate_column_width_fast(one_byte ? 1 : whole, CSTR("aßc"));

  estimate_column_width_fast(one_byte ? 1 : whole, CSTR("a\u07ffc"));
  estimate_column_width_fast(one_byte ? 1 : whole, CSTR("a\u0800c"));

  estimate_column_width_fast(one_byte ? 1 : whole, CSTR("a\u10ffc"));

  // First 2-column character
  estimate_column_width_fast(1, CSTR("a\u1100c"));

  estimate_column_width_fast(1, CSTR("a\U0000ffffc"));
  estimate_column_width_fast(1, CSTR("a\U00010000c"));
  estimate_column_width_fast(1, CSTR("a\U0010FFFFc"));
}
|
||||
|
||||
// Asserts that __estimate_column_width reports `expected` columns for the
// NUL-terminated string `str`. The maximum passed to the function is -1.
template <class CharT>
constexpr void estimate_column_width(size_t expected, const CharT* str) {
  const std::basic_string_view<CharT> view{str};
  const __detail::__column_width_result<CharT> result =
      __detail::__estimate_column_width(view.begin(), view.end(), -1);
  assert(result.__width == expected);
}
|
||||
|
||||
// Checks the estimated column width of single code points, grouped by the
// length of their UTF-8 encoding, for one character type.
template <class CharT>
constexpr void estimate_column_width() {
  // Shorthands for the two widths expected below.
  const auto one_column = [](const CharT* text) {
    estimate_column_width(1, text);
  };
  const auto two_columns = [](const CharT* text) {
    estimate_column_width(2, text);
  };

  //*** 1-byte code points ***
  one_column(CSTR(" "));
  one_column(CSTR("~"));

  //*** 2-byte code points ***
  one_column(CSTR("\u00a1")); // INVERTED EXCLAMATION MARK
  one_column(CSTR("\u07ff")); // NKO TAMAN SIGN

  //*** 3-byte code points ***
  one_column(CSTR("\u0800")); // SAMARITAN LETTER ALAF
  one_column(CSTR("\ufffd")); // REPLACEMENT CHARACTER

  // 2 column ranges
  two_columns(CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK
  two_columns(CSTR("\u115f")); // HANGUL CHOSEONG FILLER

  two_columns(CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET
  two_columns(CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET

  two_columns(CSTR("\u2e80")); // CJK RADICAL REPEAT
  two_columns(CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR

  two_columns(CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
  two_columns(CSTR("\ua4cf")); // U+A4D0 LISU LETTER BA

  two_columns(CSTR("\uac00")); // <Hangul Syllable, First>
  two_columns(CSTR("\ud7a3")); // Hangul Syllable Hih

  two_columns(CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
  two_columns(CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF

  two_columns(CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
  two_columns(
      CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS

  two_columns(CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
  two_columns(CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM

  two_columns(CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
  two_columns(CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS

  two_columns(CSTR("\uffe0")); // FULLWIDTH CENT SIGN
  two_columns(CSTR("\uffe6")); // FULLWIDTH WON SIGN

  //*** 4-byte code points ***
  one_column(CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A
  one_column(CSTR("\U0010FFFF")); // Undefined Character

  // 2 column ranges
  two_columns(CSTR("\U0001f300")); // CYCLONE
  two_columns(CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS
  two_columns(CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
  two_columns(CSTR("\U0001f9ff")); // NAZAR AMULET
  two_columns(CSTR("\U00020000")); // <CJK Ideograph Extension B, First>
  two_columns(CSTR("\U0002fffd")); // Undefined Character
  two_columns(CSTR("\U00030000")); // <CJK Ideograph Extension G, First>
  two_columns(CSTR("\U0003fffd")); // Undefined Character
}
|
||||
|
||||
// Exercises __get_string_alignment for one character type on a platform with
// Unicode support: truncation by precision, padding by width, and corrupt
// input sequences.
template <class CharT>
constexpr void get_string_alignment() {
  // Added to the expected offset of inputs holding one \uXXXX character: 2
  // when CharT is one byte wide, 0 for the wider character types.
  constexpr int utf8_extra = 2 * (sizeof(CharT) == 1);

  // Truncate the input.
  get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);

  // The 2-column character gets entirely rejected.
  get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2);

  // Due to the requested width extra alignment is required.
  get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2);

  // Same but for a 2-column 4-byte UTF-8 sequence
  get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2);
  get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2);

  // No alignment required.
  get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
  get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);

  // Special case, we have a special character already parsed and have enough
  // width to satisfy the minimum required width.
  get_string_alignment(3 + utf8_extra, 0, false, CSTR("ab\u1111"), 2, -1);

  // Evaluates all so size -> 4 (the input contains a 2-column character).
  get_string_alignment(3 + utf8_extra, 4, false, CSTR("a\u115fc"), 3, -1);
  // Evaluates all so size -> 4 (the input contains a 2-column character).
  get_string_alignment(3 + utf8_extra, 4, false, CSTR("a\u115fc"), 4, -1);

  // Evaluates all so size -> 5 (the input contains a 2-column character).
  get_string_alignment(4 + utf8_extra, 5, false, CSTR("a\u115fcd"), 4, -1);

  // Evaluates all so size -> 5 (the input contains a 2-column character).
  get_string_alignment(4 + utf8_extra, 5, false, CSTR("a\u115fcd"), 5, -1);

  // Extend width
  get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
  // The input contains a 1-column character.
  get_string_alignment(3 + utf8_extra, 3, true, CSTR("a\u1160c"), 4, -1);

  // In this case the threshold where the width is still determined.
  get_string_alignment(2 + utf8_extra, 3, false, CSTR("i\u1110"), 2, -1);

  // The width is no longer exactly determined.
  get_string_alignment(2 + utf8_extra, 0, false, CSTR("i\u1110"), 1, -1);

  // Extend width and truncate input.
  get_string_alignment(1, 1, true, CSTR("abc"), 3, 1);

  if constexpr (sizeof(CharT) == 1) {
    // Corrupt UTF-8 sequence.
    get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3);
    get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3);
    get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3);
  } else if constexpr (sizeof(CharT) == 2) {
    if constexpr (std::same_as<CharT, char16_t>) {
      // Corrupt UTF-16 sequence.
      get_string_alignment(2, 2, false, u"a\xdddd", 0, 3);
    } else {
      // Corrupt UTF-16 wchar_t sequence.
      get_string_alignment(2, 2, false, L"a\xdddd", 0, 3);
    }
  }
  // UTF-32 doesn't combine characters, thus no corruption tests.
}
|
||||
|
||||
// Runs all checks of this file for the character type CharT.
template <class CharT>
constexpr void test() {
  estimate_column_width_fast<CharT>();
  estimate_column_width<CharT>();
  get_string_alignment<CharT>();
}
|
||||
|
||||
// Runs the checks for every character type the configuration provides.
// Always returns true so the call can be used inside a static_assert.
constexpr bool test() {
  test<char>();
  test<wchar_t>();
#if !defined(_LIBCPP_HAS_NO_CHAR8_T)
  test<char8_t>();
#endif
#if !defined(_LIBCPP_HAS_NO_UNICODE_CHARS)
  test<char16_t>();
  test<char32_t>();
#endif
  return true;
}
|
||||
|
||||
int main(int, char**) {
  // Evaluate the checks during constant evaluation and at run time.
  static_assert(test());
  test();

  return 0;
}
|
Loading…
Reference in New Issue