Re-land c346068928 with fixes

It was previously reverted in a6beb18b84
due to test failures.
This commit is contained in:
Marcus Johnson 2022-03-23 08:12:14 -04:00 committed by Aaron Ballman
parent 3b74aac29c
commit d14ccbc2e8
3 changed files with 124 additions and 23 deletions

View File

@ -126,6 +126,9 @@ typedef unsigned char Boolean; /* 0 or 1 */
/* Byte order mark values for 16-bit and 32-bit code units. The NATIVE value is
 * what a BOM looks like when the unit is read in the expected byte order; the
 * SWAPPED value is the same unit with its bytes reversed, i.e. what is seen
 * when the data was written in the opposite byte order. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
#define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF
#define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000
typedef enum {
conversionOK, /* conversion successful */
sourceExhausted, /* partial character in source, but hit end */
@ -281,6 +284,24 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
*/
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
/**
 * Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string.
 *
 * If the first 32-bit unit is a byte-swapped byte order mark, the whole input
 * is byte-swapped before conversion. A leading byte order mark is consumed and
 * is not written to \p Out. Empty input is accepted and reports success.
 *
 * \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text.
 * Its size must be a multiple of 4 bytes, and the implementation asserts that
 * the buffer is aligned for UTF32 access.
 * \param [out] Out Converted UTF-8 is stored here on success. Must be empty
 * when the function is called (asserted); it is cleared again if the
 * conversion fails.
 * \returns true on success, false if the byte count is not a multiple of 4 or
 * the input cannot be converted.
 */
bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
/**
 * Converts a UTF32 string into a UTF8 std::string.
 *
 * This overload views \p Src as raw bytes and forwards to the byte-buffer
 * overload, so a leading byte order mark is handled the same way there.
 *
 * \param [in] Src A buffer of UTF-32 encoded text.
 * \param [out] Out Converted UTF-8 is stored here on success.
 * \returns true on success
 */
bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
/**
* Converts a UTF-8 string into a UTF-16 string with native endianness.
*

View File

@ -34,31 +34,31 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
const UTF8 *sourceStart = (const UTF8*)Source.data();
// FIXME: Make the type of the result buffer correct instead of
// using reinterpret_cast.
UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
UTF16 *targetStart = reinterpret_cast<UTF16 *>(ResultPtr);
ConversionFlags flags = strictConversion;
result = ConvertUTF8toUTF16(
&sourceStart, sourceStart + Source.size(),
&targetStart, targetStart + Source.size(), flags);
result =
ConvertUTF8toUTF16(&sourceStart, sourceStart + Source.size(),
&targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
ResultPtr = reinterpret_cast<char *>(targetStart);
else
ErrorPtr = sourceStart;
} else if (WideCharWidth == 4) {
const UTF8 *sourceStart = (const UTF8*)Source.data();
const UTF8 *sourceStart = (const UTF8 *)Source.data();
// FIXME: Make the type of the result buffer correct instead of
// using reinterpret_cast.
UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
UTF32 *targetStart = reinterpret_cast<UTF32 *>(ResultPtr);
ConversionFlags flags = strictConversion;
result = ConvertUTF8toUTF32(
&sourceStart, sourceStart + Source.size(),
&targetStart, targetStart + Source.size(), flags);
result =
ConvertUTF8toUTF32(&sourceStart, sourceStart + Source.size(),
&targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
ResultPtr = reinterpret_cast<char *>(targetStart);
else
ErrorPtr = sourceStart;
}
assert((result != targetExhausted)
&& "ConvertUTF8toUTFXX exhausted target buffer");
assert((result != targetExhausted) &&
"ConvertUTF8toUTFXX exhausted target buffer");
return result == conversionOK;
}
@ -67,20 +67,18 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
const UTF32 *SourceEnd = SourceStart + 1;
UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
UTF8 *TargetEnd = TargetStart + 4;
ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
&TargetStart, TargetEnd,
strictConversion);
ConversionResult CR = ConvertUTF32toUTF8(
&SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion);
if (CR != conversionOK)
return false;
ResultPtr = reinterpret_cast<char*>(TargetStart);
ResultPtr = reinterpret_cast<char *>(TargetStart);
return true;
}
bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
return (S.size() >= 2 &&
((S[0] == '\xff' && S[1] == '\xfe') ||
(S[0] == '\xfe' && S[1] == '\xff')));
return (S.size() >= 2 && ((S[0] == '\xff' && S[1] == '\xfe') ||
(S[0] == '\xfe' && S[1] == '\xff')));
}
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
@ -134,11 +132,69 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
return true;
}
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
{
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
return convertUTF16ToUTF8String(
llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
Src.size() * sizeof(UTF16)), Out);
Src.size() * sizeof(UTF16)),
Out);
}
/// Converts raw bytes assumed to hold UTF-32 text into UTF-8 stored in \p Out.
///
/// A byte-swapped byte order mark in the first unit causes the whole input to
/// be byte-swapped before conversion; a leading byte order mark is never
/// copied to the output. \p Out must be empty on entry (asserted).
///
/// \returns false if the byte count is not a whole number of UTF32 units or if
/// the strict conversion fails (in which case \p Out is cleared); true
/// otherwise, including for empty input.
bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // Error out on a byte count that is not a whole number of UTF-32 units.
  if (SrcBytes.size() % sizeof(UTF32))
    return false;

  // Avoid OOB by returning early on empty input: the BOM checks below read
  // the first unit unconditionally.
  if (SrcBytes.empty())
    return true;

  const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin());
  const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end());

  // The buffer is read through UTF32 pointers, so it must be suitably aligned.
  assert((uintptr_t)Src % sizeof(UTF32) == 0);

  // Byteswap if necessary. The swapped copy must outlive the conversion, so it
  // is declared at function scope.
  std::vector<UTF32> ByteSwapped;
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.assign(Src, SrcEnd);
    for (UTF32 &Unit : ByteSwapped)
      Unit = llvm::ByteSwap_32(Unit);
    Src = ByteSwapped.data();
    SrcEnd = Src + ByteSwapped.size();
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
    ++Src;

  // Allocate the worst case up front and shrink afterwards. The bound is per
  // UTF-32 unit (not per input byte): each unit converts to at most
  // UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. The extra byte leaves room for a
  // null terminator without reallocating.
  const auto NumUnits = static_cast<std::string::size_type>(SrcEnd - Src);
  Out.resize(NumUnits * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  // The buffer was sized for the worst case, so it can never be exhausted.
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  // Trim to the bytes actually written, then force a terminator to be written
  // just past the end without changing the logical size.
  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}
/// Converts UTF-32 code units to UTF-8 by viewing \p Src as raw bytes and
/// delegating to the byte-buffer overload.
bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
  const char *RawBytes = reinterpret_cast<const char *>(Src.data());
  const auto NumBytes = Src.size() * sizeof(UTF32);
  llvm::ArrayRef<char> ByteView(RawBytes, NumBytes);
  return convertUTF32ToUTF8String(ByteView, Out);
}
bool convertUTF8ToUTF16String(StringRef SrcUTF8,

View File

@ -25,6 +25,18 @@ TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
EXPECT_EQ(Expected, Result);
}
TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) {
  // Little-endian BOM followed by U+0CA0 '_' U+0CA0 (the look of disapproval).
  alignas(UTF32) static const char Src[] =
      "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00";
  // Exclude the string literal's implicit trailing NUL from the input.
  ArrayRef<char> Input(Src, sizeof(Src) - 1);
  std::string Converted;
  EXPECT_TRUE(convertUTF32ToUTF8String(Input, Converted));
  const std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
  EXPECT_EQ(Expected, Converted);
}
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
// Src is the look of disapproval.
alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
@ -36,6 +48,18 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
EXPECT_EQ(Expected, Result);
}
TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) {
  // Big-endian BOM followed by U+0CA0 '_' U+0CA0 (the look of disapproval).
  alignas(UTF32) static const char Src[] =
      "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0";
  // Exclude the string literal's implicit trailing NUL from the input.
  ArrayRef<char> Input(Src, sizeof(Src) - 1);
  std::string Converted;
  EXPECT_TRUE(convertUTF32ToUTF8String(Input, Converted));
  const std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
  EXPECT_EQ(Expected, Converted);
}
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
// Src is the look of disapproval.
static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";