forked from OSchip/llvm-project
More UTF string conversion wrappers
Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB-specific. Patch by cameron314. Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247
This commit is contained in:
parent
47c3a4743e
commit
7423f40674
|
@ -197,6 +197,25 @@ namespace llvm {
|
|||
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
|
||||
char *&ResultPtr, const UTF8 *&ErrorPtr);
|
||||
|
||||
/**
|
||||
* Converts a UTF-8 StringRef to a std::wstring.
|
||||
* \return true on success.
|
||||
*/
|
||||
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
|
||||
|
||||
/**
|
||||
* Converts a UTF-8 C-string (null yields an empty string) to a std::wstring.
|
||||
* \return true on success.
|
||||
*/
|
||||
bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
|
||||
|
||||
/**
|
||||
* Converts a std::wstring to a UTF-8 encoded std::string.
|
||||
* \return true on success.
|
||||
*/
|
||||
bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
|
||||
|
||||
|
||||
/**
|
||||
* Convert a Unicode code point to a UTF-8 sequence.
|
||||
*
|
||||
|
@ -251,6 +270,15 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
|
|||
*/
|
||||
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
|
||||
|
||||
/**
|
||||
* Converts a UTF16 string into a UTF8 std::string.
|
||||
*
|
||||
* \param [in] Src A buffer of UTF-16 encoded text.
|
||||
* \param [out] Out Converted UTF-8 is stored here on success.
|
||||
* \returns true on success
|
||||
*/
|
||||
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
|
||||
|
||||
/**
|
||||
* Converts a UTF-8 string into a UTF-16 string with native endianness.
|
||||
*
|
||||
|
|
|
@ -787,9 +787,28 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
|
|||
assert(envVar && "Environment variable name missing");
|
||||
|
||||
// Get the environment variable they want us to parse options out of.
|
||||
#ifdef _WIN32
|
||||
std::wstring wenvVar;
|
||||
if (!llvm::ConvertUTF8toWide(envVar, wenvVar)) {
|
||||
assert(false &&
|
||||
"Unicode conversion of environment variable name failed");
|
||||
return;
|
||||
}
|
||||
const wchar_t *wenvValue = _wgetenv(wenvVar.c_str());
|
||||
if (!wenvValue)
|
||||
return;
|
||||
std::string envValueBuffer;
|
||||
if (!llvm::convertWideToUTF8(wenvValue, envValueBuffer)) {
|
||||
assert(false &&
|
||||
"Unicode conversion of environment variable value failed");
|
||||
return;
|
||||
}
|
||||
const char *envValue = envValueBuffer.c_str();
|
||||
#else
|
||||
const char *envValue = getenv(envVar);
|
||||
if (!envValue)
|
||||
return;
|
||||
#endif
|
||||
|
||||
// Get program's "name", which we wouldn't know without the caller
|
||||
// telling us.
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/SwapByteOrder.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -36,7 +37,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
|
|||
ConversionFlags flags = strictConversion;
|
||||
result = ConvertUTF8toUTF16(
|
||||
&sourceStart, sourceStart + Source.size(),
|
||||
&targetStart, targetStart + 2*Source.size(), flags);
|
||||
&targetStart, targetStart + Source.size(), flags);
|
||||
if (result == conversionOK)
|
||||
ResultPtr = reinterpret_cast<char*>(targetStart);
|
||||
else
|
||||
|
@ -49,7 +50,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
|
|||
ConversionFlags flags = strictConversion;
|
||||
result = ConvertUTF8toUTF32(
|
||||
&sourceStart, sourceStart + Source.size(),
|
||||
&targetStart, targetStart + 4*Source.size(), flags);
|
||||
&targetStart, targetStart + Source.size(), flags);
|
||||
if (result == conversionOK)
|
||||
ResultPtr = reinterpret_cast<char*>(targetStart);
|
||||
else
|
||||
|
@ -130,6 +131,13 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Converts a buffer of UTF-16 code units to a UTF-8 std::string by viewing
/// the code units as raw bytes and forwarding to the ArrayRef<char> overload.
///
/// \param [in] Src UTF-16 code units to convert.
/// \param [out] Out Receives the converted text.
/// \returns whatever the ArrayRef<char> overload returns for the same bytes.
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
  const char *RawBytes = reinterpret_cast<const char *>(Src.data());
  const size_t RawByteCount = Src.size() * sizeof(UTF16);
  return convertUTF16ToUTF8String(llvm::ArrayRef<char>(RawBytes, RawByteCount),
                                  Out);
}
|
||||
|
||||
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
|
||||
SmallVectorImpl<UTF16> &DstUTF16) {
|
||||
assert(DstUTF16.empty());
|
||||
|
@ -168,5 +176,74 @@ bool convertUTF8ToUTF16String(StringRef SrcUTF8,
|
|||
return true;
|
||||
}
|
||||
|
||||
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
|
||||
sizeof(wchar_t) == 4,
|
||||
"Expected wchar_t to be 1, 2, or 4 bytes");
|
||||
|
||||
template <typename TResult>
|
||||
static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
|
||||
TResult &Result) {
|
||||
// Even in the case of UTF-16, the number of bytes in a UTF-8 string is
|
||||
// at least as large as the number of elements in the resulting wide
|
||||
// string, because surrogate pairs take at least 4 bytes in UTF-8.
|
||||
Result.resize(Source.size() + 1);
|
||||
char *ResultPtr = reinterpret_cast<char *>(&Result[0]);
|
||||
const UTF8 *ErrorPtr;
|
||||
if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
|
||||
Result.clear();
|
||||
return false;
|
||||
}
|
||||
Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Converts a UTF-8 StringRef to a std::wstring.
/// \returns true on success; see ConvertUTF8toWideInternal for the details.
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
  return ConvertUTF8toWideInternal<std::wstring>(Source, Result);
}
|
||||
|
||||
bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
|
||||
if (!Source) {
|
||||
Result.clear();
|
||||
return true;
|
||||
}
|
||||
return ConvertUTF8toWide(llvm::StringRef(Source), Result);
|
||||
}
|
||||
|
||||
bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
|
||||
if (sizeof(wchar_t) == 1) {
|
||||
const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
|
||||
const UTF8 *End =
|
||||
reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
|
||||
if (!isLegalUTF8String(&Start, End))
|
||||
return false;
|
||||
Result.resize(Source.size());
|
||||
memcpy(&Result[0], Source.data(), Source.size());
|
||||
return true;
|
||||
} else if (sizeof(wchar_t) == 2) {
|
||||
return convertUTF16ToUTF8String(
|
||||
llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
|
||||
Source.size()),
|
||||
Result);
|
||||
} else if (sizeof(wchar_t) == 4) {
|
||||
const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
|
||||
const UTF32 *End =
|
||||
reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
|
||||
Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
|
||||
UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
|
||||
UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
|
||||
if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
|
||||
strictConversion) == conversionOK) {
|
||||
Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
|
||||
return true;
|
||||
} else {
|
||||
Result.clear();
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
llvm_unreachable(
|
||||
"Control should never reach this point; see static_assert further up");
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
|
|
|
@ -59,7 +59,7 @@ TEST(ConvertUTFTest, OddLengthInput) {
|
|||
|
||||
TEST(ConvertUTFTest, Empty) {
|
||||
std::string Result;
|
||||
bool Success = convertUTF16ToUTF8String(None, Result);
|
||||
bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
|
||||
EXPECT_TRUE(Success);
|
||||
EXPECT_TRUE(Result.empty());
|
||||
}
|
||||
|
@ -80,6 +80,41 @@ TEST(ConvertUTFTest, HasUTF16BOM) {
|
|||
EXPECT_FALSE(HasBOM);
|
||||
}
|
||||
|
||||
// Exercises the ArrayRef<UTF16> overload of convertUTF16ToUTF8String.
TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
  // Src is a byte order mark followed by the look of disapproval. It is
  // declared as UTF16 code units rather than as a char array cast to
  // `const UTF16 *`, because a char array carries no guarantee of being
  // suitably aligned for 2-byte reads. On a little-endian host these four
  // code units have the same bytes as "\xff\xfe\xa0\x0c_\x00\xa0\x0c".
  static const UTF16 Src[] = {0xfeff, 0x0ca0, '_', 0x0ca0};
  ArrayRef<UTF16> SrcRef = makeArrayRef(Src, 4);
  std::string Result;
  bool Success = convertUTF16ToUTF8String(SrcRef, Result);
  EXPECT_TRUE(Success);
  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  EXPECT_EQ(Expected, Result);
}
|
||||
|
||||
// Checks both std::wstring-producing overloads of ConvertUTF8toWide against
// the same input.
TEST(ConvertUTFTest, ConvertUTF8toWide) {
  // The look of disapproval, encoded as UTF-8 (7 bytes plus the terminator).
  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  const std::wstring Expected(L"\x0ca0_\x0ca0");

  // C-string overload.
  std::wstring FromCString;
  EXPECT_TRUE(ConvertUTF8toWide(static_cast<const char *>(Src), FromCString));
  EXPECT_EQ(Expected, FromCString);

  // StringRef overload, given the explicit byte length.
  std::wstring FromStringRef;
  EXPECT_TRUE(ConvertUTF8toWide(StringRef(Src, 7), FromStringRef));
  EXPECT_EQ(Expected, FromStringRef);
}
|
||||
|
||||
// Checks that convertWideToUTF8 turns a wide string into the expected UTF-8
// bytes.
TEST(ConvertUTFTest, convertWideToUTF8) {
  // The look of disapproval as a wide string.
  static const wchar_t Src[] = L"\x0ca0_\x0ca0";
  const std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");

  std::string Converted;
  EXPECT_TRUE(convertWideToUTF8(std::wstring(Src), Converted));
  EXPECT_EQ(Expected, Converted);
}
|
||||
|
||||
struct ConvertUTFResultContainer {
|
||||
ConversionResult ErrorCode;
|
||||
std::vector<unsigned> UnicodeScalars;
|
||||
|
|
Loading…
Reference in New Issue