[libc++] Use bit field for checking if string is in long or short mode

This makes the code a bit simpler and (I think) removes the undefined behaviour from the normal string layout.

Reviewed By: ldionne, Mordante, #libc

Spies: labath, dblaikie, JDevlieghere, krytarowski, jgorbe, jingham, saugustine, arichardson, libcxx-commits

Differential Revision: https://reviews.llvm.org/D123580
This commit is contained in:
Nikolas Klauser 2022-04-20 10:52:04 +02:00
parent 1056c56786
commit 29c8c070a1
4 changed files with 198 additions and 93 deletions

View File

@ -532,6 +532,8 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
#include <__utility/auto_cast.h> #include <__utility/auto_cast.h>
#include <__utility/move.h> #include <__utility/move.h>
#include <__utility/swap.h> #include <__utility/swap.h>
#include <__utility/unreachable.h>
#include <climits>
#include <compare> #include <compare>
#include <cstdio> // EOF #include <cstdio> // EOF
#include <cstdlib> #include <cstdlib>
@ -539,6 +541,7 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
#include <initializer_list> #include <initializer_list>
#include <iosfwd> #include <iosfwd>
#include <iterator> #include <iterator>
#include <limits>
#include <memory> #include <memory>
#include <stdexcept> #include <stdexcept>
#include <string_view> #include <string_view>
@ -674,17 +677,10 @@ private:
{ {
pointer __data_; pointer __data_;
size_type __size_; size_type __size_;
size_type __cap_; size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
size_type __is_long_ : 1;
}; };
#ifdef _LIBCPP_BIG_ENDIAN
static const size_type __short_mask = 0x01;
static const size_type __long_mask = 0x1ul;
#else // _LIBCPP_BIG_ENDIAN
static const size_type __short_mask = 0x80;
static const size_type __long_mask = ~(size_type(~0) >> 1);
#endif // _LIBCPP_BIG_ENDIAN
enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ? enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
(sizeof(__long) - 1)/sizeof(value_type) : 2}; (sizeof(__long) - 1)/sizeof(value_type) : 2};
@ -692,26 +688,46 @@ private:
{ {
value_type __data_[__min_cap]; value_type __data_[__min_cap];
unsigned char __padding[sizeof(value_type) - 1]; unsigned char __padding[sizeof(value_type) - 1];
unsigned char __size_; unsigned char __size_ : 7;
unsigned char __is_long_ : 1;
}; };
// The __endian_factor is required because the field we use to store the size
// (either size_type or unsigned char depending on long/short) has one fewer
// bit than it would if it were not a bitfield.
//
// If the LSB is used to store the short-flag in the short string representation,
// we have to multiply the size by two when it is stored and divide it by two when
// it is loaded to make sure that we always store an even number. In the long string
// representation, we can ignore this because we can assume that we always allocate
// an even amount of value_types.
//
// If the MSB is used for the short-flag, the max_size() is numeric_limits<size_type>::max() / 2.
// This does not impact the short string representation, since we never need the MSB
// for representing the size of a short string anyway.
#ifdef _LIBCPP_BIG_ENDIAN
static const size_type __endian_factor = 2;
#else #else
static const size_type __endian_factor = 1;
#endif
#else // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
#ifdef _LIBCPP_BIG_ENDIAN
static const size_type __endian_factor = 1;
#else
static const size_type __endian_factor = 2;
#endif
struct __long struct __long
{ {
size_type __cap_; size_type __is_long_ : 1;
size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
size_type __size_; size_type __size_;
pointer __data_; pointer __data_;
}; };
#ifdef _LIBCPP_BIG_ENDIAN
static const size_type __short_mask = 0x80;
static const size_type __long_mask = ~(size_type(~0) >> 1);
#else // _LIBCPP_BIG_ENDIAN
static const size_type __short_mask = 0x01;
static const size_type __long_mask = 0x1ul;
#endif // _LIBCPP_BIG_ENDIAN
enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ? enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
(sizeof(__long) - 1)/sizeof(value_type) : 2}; (sizeof(__long) - 1)/sizeof(value_type) : 2};
@ -719,7 +735,10 @@ private:
{ {
union union
{ {
unsigned char __size_; struct {
unsigned char __is_long_ : 1;
unsigned char __size_ : 7;
};
value_type __lx; value_type __lx;
}; };
value_type __data_[__min_cap]; value_type __data_[__min_cap];
@ -1426,8 +1445,9 @@ public:
_LIBCPP_INLINE_VISIBILITY void __shrink_or_extend(size_type __target_capacity); _LIBCPP_INLINE_VISIBILITY void __shrink_or_extend(size_type __target_capacity);
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
bool __is_long() const _NOEXCEPT bool __is_long() const _NOEXCEPT {
{return bool(__r_.first().__s.__size_ & __short_mask);} return __r_.first().__s.__is_long_;
}
#if _LIBCPP_DEBUG_LEVEL == 2 #if _LIBCPP_DEBUG_LEVEL == 2
@ -1474,43 +1494,18 @@ private:
_LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return __r_.second(); } _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return __r_.second(); }
_LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); } _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); }
#ifdef _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT _LIBCPP_INLINE_VISIBILITY
void __set_short_size(size_type __s) _NOEXCEPT {
_LIBCPP_ASSERT(__s < __min_cap, "__s should never be greater than or equal to the short string capacity");
__r_.first().__s.__size_ = __s;
__r_.first().__s.__is_long_ = false;
}
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
void __set_short_size(size_type __s) _NOEXCEPT size_type __get_short_size() const _NOEXCEPT {
# ifdef _LIBCPP_BIG_ENDIAN _LIBCPP_ASSERT(!__r_.first().__s.__is_long_, "String has to be short when trying to get the short size");
{__r_.first().__s.__size_ = (unsigned char)(__s << 1);} return __r_.first().__s.__size_;
# else }
{__r_.first().__s.__size_ = (unsigned char)(__s);}
# endif
_LIBCPP_INLINE_VISIBILITY
size_type __get_short_size() const _NOEXCEPT
# ifdef _LIBCPP_BIG_ENDIAN
{return __r_.first().__s.__size_ >> 1;}
# else
{return __r_.first().__s.__size_;}
# endif
#else // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
_LIBCPP_INLINE_VISIBILITY
void __set_short_size(size_type __s) _NOEXCEPT
# ifdef _LIBCPP_BIG_ENDIAN
{__r_.first().__s.__size_ = (unsigned char)(__s);}
# else
{__r_.first().__s.__size_ = (unsigned char)(__s << 1);}
# endif
_LIBCPP_INLINE_VISIBILITY
size_type __get_short_size() const _NOEXCEPT
# ifdef _LIBCPP_BIG_ENDIAN
{return __r_.first().__s.__size_;}
# else
{return __r_.first().__s.__size_ >> 1;}
# endif
#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
void __set_long_size(size_type __s) _NOEXCEPT void __set_long_size(size_type __s) _NOEXCEPT
@ -1523,11 +1518,15 @@ private:
{if (__is_long()) __set_long_size(__s); else __set_short_size(__s);} {if (__is_long()) __set_long_size(__s); else __set_short_size(__s);}
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
void __set_long_cap(size_type __s) _NOEXCEPT void __set_long_cap(size_type __s) _NOEXCEPT {
{__r_.first().__l.__cap_ = __long_mask | __s;} __r_.first().__l.__cap_ = __s / __endian_factor;
__r_.first().__l.__is_long_ = true;
}
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
size_type __get_long_cap() const _NOEXCEPT size_type __get_long_cap() const _NOEXCEPT {
{return __r_.first().__l.__cap_ & size_type(~__long_mask);} return __r_.first().__l.__cap_ * __endian_factor;
}
_LIBCPP_INLINE_VISIBILITY _LIBCPP_INLINE_VISIBILITY
void __set_long_pointer(pointer __p) _NOEXCEPT void __set_long_pointer(pointer __p) _NOEXCEPT
@ -3225,11 +3224,12 @@ typename basic_string<_CharT, _Traits, _Allocator>::size_type
basic_string<_CharT, _Traits, _Allocator>::max_size() const _NOEXCEPT basic_string<_CharT, _Traits, _Allocator>::max_size() const _NOEXCEPT
{ {
size_type __m = __alloc_traits::max_size(__alloc()); size_type __m = __alloc_traits::max_size(__alloc());
#ifdef _LIBCPP_BIG_ENDIAN if (__m <= std::numeric_limits<size_type>::max() / 2) {
return (__m <= ~__long_mask ? __m : __m/2) - __alignment; return __m - __alignment;
#else } else {
return __m - __alignment; bool __uses_lsb = __endian_factor == 2;
#endif return __uses_lsb ? __m - __alignment : (__m / 2) - __alignment;
}
} }
template <class _CharT, class _Traits, class _Allocator> template <class _CharT, class _Traits, class _Allocator>

View File

@ -0,0 +1,125 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED:
// <string>
// This test ensures that the correct max_size() is returned depending on the platform.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>
#include "test_macros.h"
// alignment of the string heap buffer is hardcoded to 16
static const size_t alignment = 16;
void full_size() {
std::string str;
assert(str.max_size() == std::numeric_limits<size_t>::max() - alignment);
#ifndef TEST_HAS_NO_CHAR8_T
std::u8string u8str;
assert(u8str.max_size() == std::numeric_limits<size_t>::max() - alignment);
#endif
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::wstring wstr;
assert(wstr.max_size() == std::numeric_limits<size_t>::max() / sizeof(wchar_t) - alignment);
#endif
#ifndef TEST_HAS_NO_UNICODE_CHARS
std::u16string u16str;
std::u32string u32str;
assert(u16str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
assert(u32str.max_size() == std::numeric_limits<size_t>::max() / 4 - alignment);
#endif
}
void half_size() {
std::string str;
assert(str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
#ifndef TEST_HAS_NO_CHAR8_T
std::u8string u8str;
assert(u8str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
#endif
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::wstring wstr;
assert(wstr.max_size() == std::numeric_limits<size_t>::max() / std::max<size_t>(2ul, sizeof(wchar_t)) - alignment);
#endif
#ifndef TEST_HAS_NO_UNICODE_CHARS
std::u16string u16str;
std::u32string u32str;
assert(u16str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
assert(u32str.max_size() == std::numeric_limits<size_t>::max() / 4 - alignment);
#endif
}
bool test() {
#if _LIBCPP_ABI_VERSION == 1
#ifdef __APPLE__
# ifdef __aarch64__
half_size();
# elif defined(__x86_64__)
full_size();
# else
# error "Your target system seems to be unsupported."
# endif
#elif defined(__linux__)
# ifdef __x86_64__
full_size();
# elif defined(__arm__) || defined(__aarch64__)
# ifdef __BIG_ENDIAN__
half_size();
# else
full_size();
# endif
# else
# error "Your target system seems to be unsupported."
# endif
#elif defined(__powerpc__)
half_size();
#elif defined(__powepc64__)
half_size();
#elif defined(_WIN64)
# ifdef __x86_64__
full_size();
# else
# error "Your target system seems to be unsupported."
# endif
#else
# error "Your target system seems to be unsupported."
#endif
#endif
return true;
}
int main(int, char**) {
test();
#if TEST_STD_VER > 17
// static_assert(test());
#endif
return 0;
}

View File

@ -9,6 +9,11 @@
// UNSUPPORTED: no-exceptions // UNSUPPORTED: no-exceptions
// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11}} // XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11}}
// Prior to http://llvm.org/D123580, there was a bug with how the max_size()
// was calculated. That was inlined into some functions in the dylib, which leads
// to failures when running this test against an older system dylib.
// XFAIL: use_system_cxx_lib && target=arm64-apple-macosx{{11.0|12.0}}
// <string> // <string>
// size_type max_size() const; // size_type max_size() const;

View File

@ -192,26 +192,6 @@ def _value_of_pair_first(value):
class StdStringPrinter(object): class StdStringPrinter(object):
"""Print a std::string.""" """Print a std::string."""
def _get_short_size(self, short_field, short_size):
"""Short size depends on both endianness and a compile-time define."""
# If the padding field is present after all this indirection, then string
# was compiled with _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT defined.
field = short_field.type.fields()[1].type.fields()[0]
libcpp_abi_alternate_string_layout = field.name and "__padding" in field.name
# This logical structure closely follows the original code (which is clearer
# in C++). Keep them parallel to make them easier to compare.
if libcpp_abi_alternate_string_layout:
if _libcpp_big_endian:
return short_size >> 1
else:
return short_size
elif _libcpp_big_endian:
return short_size
else:
return short_size >> 1
def __init__(self, val): def __init__(self, val):
self.val = val self.val = val
@ -223,18 +203,13 @@ class StdStringPrinter(object):
short_size = short_field["__size_"] short_size = short_field["__size_"]
if short_size == 0: if short_size == 0:
return "" return ""
short_mask = self.val["__short_mask"] if short_field["__is_long_"]:
# Counter intuitive to compare the size and short_mask to see if the string
# is long, but that's the way the implementation does it. Note that
# __is_long() doesn't use get_short_size in C++.
is_long = short_size & short_mask
if is_long:
long_field = value_field["__l"] long_field = value_field["__l"]
data = long_field["__data_"] data = long_field["__data_"]
size = long_field["__size_"] size = long_field["__size_"]
else: else:
data = short_field["__data_"] data = short_field["__data_"]
size = self._get_short_size(short_field, short_size) size = short_field["__size_"]
return data.lazy_string(length=size) return data.lazy_string(length=size)
def display_hint(self): def display_hint(self):