forked from OSchip/llvm-project
Add UTF32 to/from UTF8 conversion functions
This is anticipated to be used in new format specifier checking code.
This commit is contained in:
parent
7f8572b8c3
commit
c346068928
|
@ -126,6 +126,9 @@ typedef unsigned char Boolean; /* 0 or 1 */
|
||||||
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
|
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
|
||||||
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
|
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
|
||||||
|
|
||||||
|
#define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF
|
||||||
|
#define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
conversionOK, /* conversion successful */
|
conversionOK, /* conversion successful */
|
||||||
sourceExhausted, /* partial character in source, but hit end */
|
sourceExhausted, /* partial character in source, but hit end */
|
||||||
|
@ -281,6 +284,24 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
|
||||||
*/
|
*/
|
||||||
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
|
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string.
|
||||||
|
*
|
||||||
|
* \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text.
|
||||||
|
* \param [out] Out Converted UTF-8 is stored here on success.
|
||||||
|
* \returns true on success
|
||||||
|
*/
|
||||||
|
bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a UTF32 string into a UTF8 std::string.
|
||||||
|
*
|
||||||
|
* \param [in] Src A buffer of UTF-32 encoded text.
|
||||||
|
* \param [out] Out Converted UTF-8 is stored here on success.
|
||||||
|
* \returns true on success
|
||||||
|
*/
|
||||||
|
bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a UTF-8 string into a UTF-16 string with native endianness.
|
* Converts a UTF-8 string into a UTF-16 string with native endianness.
|
||||||
*
|
*
|
||||||
|
|
|
@ -141,6 +141,62 @@ bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
|
||||||
Src.size() * sizeof(UTF16)), Out);
|
Src.size() * sizeof(UTF16)), Out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts a buffer of raw bytes assumed to hold UTF-32 text into UTF-8.
///
/// If the first code unit is a byte-swapped BOM, every code unit is
/// byte-swapped into a temporary buffer before conversion. A native BOM at
/// the front (possibly produced by that swap) is skipped and does not appear
/// in the output.
///
/// \param [in] SrcBytes Bytes of UTF-32 text; the data pointer is asserted to
///                      be aligned for UTF32.
/// \param [out] Out Asserted to be empty on entry. Holds the converted UTF-8
///                  on success and is cleared if the conversion fails.
/// \returns true on success (including empty input); false if the byte count
///          is not a whole number of UTF-32 code units or if the strict
///          conversion reports an error.
bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // A UTF-32 buffer is a whole number of 4-byte code units. Anything else is
  // malformed, and reinterpreting it would read past the end of the buffer.
  if (SrcBytes.size() % sizeof(UTF32) != 0)
    return false;

  // Avoid OOB by returning early on empty input: Src[0] is inspected below.
  if (SrcBytes.empty())
    return true;

  const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin());
  const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end());

  assert(reinterpret_cast<uintptr_t>(Src) % sizeof(UTF32) == 0);

  // Byteswap if necessary. The swapped copy must outlive the conversion
  // because Src/SrcEnd are redirected to point into it.
  std::vector<UTF32> ByteSwapped;
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.assign(Src, SrcEnd);
    for (UTF32 &CodeUnit : ByteSwapped)
      CodeUnit = llvm::ByteSwap_32(CodeUnit);
    Src = ByteSwapped.data();
    SrcEnd = Src + ByteSwapped.size();
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
    ++Src;

  // Just allocate enough space up front. We'll shrink it later. Each 4-byte
  // UTF-32 code unit encodes to at most 4 UTF-8 bytes, so the source byte
  // count bounds the output; the extra byte leaves room for a null terminator
  // without reallocating.
  Out.resize(SrcBytes.size() + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  // Trim to the bytes actually written, then push and pop a zero byte so a
  // terminator sits just past the end without counting toward size().
  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}
|
||||||
|
|
||||||
|
/// Converts an array of UTF-32 code units into UTF-8 by viewing the array as
/// raw bytes and forwarding to the byte-buffer overload.
///
/// \param [in] Src UTF-32 code units to convert.
/// \param [out] Out Passed through to the byte-buffer overload.
/// \returns whatever the byte-buffer overload returns.
bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
  const char *RawBegin = reinterpret_cast<const char *>(Src.data());
  const size_t RawSize = Src.size() * sizeof(UTF32);
  return convertUTF32ToUTF8String(llvm::ArrayRef<char>(RawBegin, RawSize),
                                  Out);
}
|
||||||
|
|
||||||
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
|
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
|
||||||
SmallVectorImpl<UTF16> &DstUTF16) {
|
SmallVectorImpl<UTF16> &DstUTF16) {
|
||||||
assert(DstUTF16.empty());
|
assert(DstUTF16.empty());
|
||||||
|
|
|
@ -25,6 +25,17 @@ TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
|
||||||
EXPECT_EQ(Expected, Result);
|
EXPECT_EQ(Expected, Result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) {
  // One UTF-32 code unit, U+1F52E (crystal ball), stored least-significant
  // byte first and without a BOM.
  alignas(UTF32) static const char Src[] = "\x2E\xF5\x01\x00";
  // sizeof includes the literal's trailing NUL; leave it out of the view.
  const ArrayRef<char> Bytes(Src, sizeof(Src) - 1);

  std::string Actual;
  EXPECT_TRUE(convertUTF32ToUTF8String(Bytes, Actual));

  // The same code point as a four-byte UTF-8 sequence.
  const std::string Wanted("\xF0\x9F\x94\xAE");
  EXPECT_EQ(Wanted, Actual);
}
|
||||||
|
|
||||||
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
|
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
|
||||||
// Src is the look of disapproval.
|
// Src is the look of disapproval.
|
||||||
alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
|
alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
|
||||||
|
@ -36,6 +47,17 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
|
||||||
EXPECT_EQ(Expected, Result);
|
EXPECT_EQ(Expected, Result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) {
  // A big-endian BOM (00 00 FE FF) followed by U+1F52E (crystal ball), both
  // stored most-significant byte first.
  alignas(UTF32) static const char Src[] = "\x00\x00\xfe\xff\x00\x01\xF5\x2E";
  // sizeof includes the literal's trailing NUL; leave it out of the view.
  const ArrayRef<char> Bytes(Src, sizeof(Src) - 1);

  std::string Actual;
  EXPECT_TRUE(convertUTF32ToUTF8String(Bytes, Actual));

  // Only the crystal ball is expected in the output, as four UTF-8 bytes.
  const std::string Wanted("\xF0\x9F\x94\xAE");
  EXPECT_EQ(Wanted, Actual);
}
|
||||||
|
|
||||||
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
|
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
|
||||||
// Src is the look of disapproval.
|
// Src is the look of disapproval.
|
||||||
static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
|
static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
|
||||||
|
|
Loading…
Reference in New Issue