forked from OSchip/llvm-project
[Support] Harden JSON against invalid UTF-8.
Parsing invalid UTF-8 input is now a parse error. Creating JSON values from invalid UTF-8 now triggers an assertion, and (in no-assert builds) substitutes the unicode replacement character. Strings retrieved from json::Value are always valid UTF-8. llvm-svn: 336657
This commit is contained in:
parent
ce5c19b623
commit
e6057bc689
|
@ -88,6 +88,17 @@ inline bool isAlpha(char C) {
|
|||
/// lowercase letter as classified by "C" locale.
|
||||
inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
|
||||
|
||||
/// Checks whether character \p C is valid ASCII (high bit is zero).
|
||||
inline bool isASCII(char C) {
  // ASCII covers code units 0..127, i.e. exactly the bytes whose top bit is
  // clear. Go through unsigned char so a signed `char` cannot skew the test.
  return (static_cast<unsigned char>(C) & 0x80u) == 0;
}
|
||||
|
||||
/// Checks whether all characters in S are ASCII.
|
||||
inline bool isASCII(llvm::StringRef S) {
|
||||
for (char C : S)
|
||||
if (LLVM_UNLIKELY(!isASCII(C)))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Returns the corresponding lowercase character if \p x is uppercase.
|
||||
inline char toLower(char x) {
|
||||
if (x >= 'A' && x <= 'Z')
|
||||
|
|
|
@ -54,6 +54,30 @@
|
|||
|
||||
namespace llvm {
|
||||
namespace json {
|
||||
|
||||
// === String encodings ===
|
||||
//
|
||||
// JSON strings are character sequences (not byte sequences like std::string).
|
||||
// We need to know the encoding, and for simplicity only support UTF-8.
|
||||
//
|
||||
// - When parsing, invalid UTF-8 is a syntax error like any other
|
||||
//
|
||||
// - When creating Values from strings, callers must ensure they are UTF-8.
|
||||
// with asserts on, invalid UTF-8 will crash the program
|
||||
// with asserts off, we'll substitute the replacement character (U+FFFD)
|
||||
// Callers can use json::isUTF8() and json::fixUTF8() for validation.
|
||||
//
|
||||
// - When retrieving strings from Values (e.g. asString()), the result will
|
||||
// always be valid UTF-8.
|
||||
|
||||
/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
|
||||
/// If it returns false, \p ErrOffset is set to a byte offset near the first error.
|
||||
bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
|
||||
/// Replaces invalid UTF-8 sequences in \p S with the replacement character
|
||||
/// (U+FFFD). The returned string is valid UTF-8.
|
||||
/// This is much slower than isUTF8, so test that first.
|
||||
std::string fixUTF8(llvm::StringRef S);
|
||||
|
||||
class Array;
|
||||
class ObjectKey;
|
||||
class Value;
|
||||
|
@ -273,16 +297,26 @@ public:
|
|||
/// Builds an object-typed value, moving \p Properties into its storage.
Value(json::Object &&Properties) : Type(T_Object) {
  create<json::Object>(std::move(Properties));
}
|
||||
// Strings: types with value semantics.
|
||||
Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
|
||||
Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
|
||||
Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
|
||||
create<std::string>(V.begin(), V.end());
|
||||
// Strings: types with value semantics. Must be valid UTF-8.
|
||||
Value(std::string V) : Type(T_String) {
|
||||
if (LLVM_UNLIKELY(!isUTF8(V))) {
|
||||
assert(false && "Invalid UTF-8 in value used as JSON");
|
||||
V = fixUTF8(std::move(V));
|
||||
}
|
||||
create<std::string>(std::move(V));
|
||||
}
|
||||
/// Copies the characters of \p V into a std::string and delegates to another
/// Value constructor with it.
Value(const llvm::SmallVectorImpl<char> &V)
    : Value(std::string(V.begin(), V.end())) {}
|
||||
// Delegates to another Value constructor with the result of V.str().
Value(const llvm::formatv_object_base &V) : Value(V.str()){};
|
||||
// Strings: types with reference semantics.
|
||||
Value(llvm::StringRef V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
|
||||
Value(const char *V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
|
||||
// Strings: types with reference semantics. Must be valid UTF-8.
|
||||
Value(StringRef V) : Type(T_StringRef) {
|
||||
create<llvm::StringRef>(V);
|
||||
if (LLVM_UNLIKELY(!isUTF8(V))) {
|
||||
assert(false && "Invalid UTF-8 in value used as JSON");
|
||||
*this = Value(fixUTF8(V));
|
||||
}
|
||||
}
|
||||
// Wraps V in a StringRef and delegates to the StringRef constructor.
Value(const char *V) : Value(StringRef(V)) {}
|
||||
// Constructs a value whose type tag is T_Null.
Value(std::nullptr_t) : Type(T_Null) {}
|
||||
// Boolean (disallow implicit conversions).
|
||||
// (The last template parameter is a dummy to keep templates distinct.)
|
||||
|
@ -449,13 +483,23 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Value &);
|
|||
/// ObjectKey is used to capture keys in Object. Like Value but:
|
||||
/// - only strings are allowed
|
||||
/// - it's optimized for the string literal case (Owned == nullptr)
|
||||
/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
|
||||
class ObjectKey {
|
||||
public:
|
||||
ObjectKey(const char *S) : Data(S) {}
|
||||
ObjectKey(llvm::StringRef S) : Data(S) {}
|
||||
ObjectKey(std::string &&V)
|
||||
: Owned(new std::string(std::move(V))), Data(*Owned) {}
|
||||
ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
|
||||
// Wraps S in a StringRef and delegates to the StringRef constructor.
ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
|
||||
ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
|
||||
if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
|
||||
assert(false && "Invalid UTF-8 in value used as JSON");
|
||||
*Owned = fixUTF8(std::move(*Owned));
|
||||
}
|
||||
Data = *Owned;
|
||||
}
|
||||
/// Refers to \p S without copying when it is valid UTF-8.
/// Otherwise this asserts; in no-assert builds the key is replaced by one
/// built from the fixUTF8() result.
ObjectKey(llvm::StringRef S) : Data(S) {
  const bool Malformed = !isUTF8(Data);
  if (LLVM_UNLIKELY(Malformed)) {
    assert(false && "Invalid UTF-8 in value used as JSON");
    *this = ObjectKey(fixUTF8(S));
  }
}
|
||||
/// Copies the characters of \p V into a std::string and delegates to another
/// ObjectKey constructor with it.
ObjectKey(const llvm::SmallVectorImpl<char> &V)
    : ObjectKey(std::string(V.begin(), V.end())) {}
|
||||
// Delegates to another ObjectKey constructor with the result of V.str().
ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
//===---------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/JSON.h"
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "llvm/Support/Format.h"
|
||||
#include <cctype>
|
||||
|
||||
|
@ -199,6 +200,14 @@ public:
|
|||
Parser(StringRef JSON)
|
||||
: Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
|
||||
|
||||
bool checkUTF8() {
|
||||
size_t ErrOffset;
|
||||
if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
|
||||
return true;
|
||||
P = Start + ErrOffset; // For line/column calculation.
|
||||
return parseError("Invalid UTF-8 sequence");
|
||||
}
|
||||
|
||||
bool parseValue(Value &Out);
|
||||
|
||||
bool assertEnd() {
|
||||
|
@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) {
|
|||
|
||||
// Case 3: it's a leading surrogate. We expect a trailing one next.
|
||||
// Case 3a: there's no trailing \u escape. Don't advance in the stream.
|
||||
if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
|
||||
if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
|
||||
Invalid(); // Leading surrogate was unpaired.
|
||||
return true;
|
||||
}
|
||||
|
@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) {
|
|||
/// Parses \p JSON as a single JSON document.
/// Returns the parsed Value on success, otherwise the error recorded by the
/// parser (P.takeError()).
Expected<Value> parse(StringRef JSON) {
  Parser P(JSON);
  Value E = nullptr;
  // Validate the encoding of the whole input *before* parsing any of it, so
  // that no Value is ever built from invalid UTF-8. The three steps
  // short-circuit: the first failure falls through to takeError().
  if (P.checkUTF8() && P.parseValue(E) && P.assertEnd())
    return std::move(E);
  return P.takeError();
}
|
||||
char ParseError::ID = 0;
|
||||
|
@ -514,6 +524,37 @@ static std::vector<const Object::value_type *> sortedElements(const Object &O) {
|
|||
return Elements;
|
||||
}
|
||||
|
||||
/// Returns true if \p S is valid UTF-8.
/// On failure, and only if \p ErrOffset is non-null, stores the byte offset
/// at which isLegalUTF8String() left its cursor.
bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
  // Pure ASCII is always valid UTF-8, so settle that case with a cheap scan.
  if (LLVM_LIKELY(isASCII(S)))
    return true;

  const UTF8 *const Begin = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *Cursor = Begin;
  const bool Legal = isLegalUTF8String(&Cursor, Begin + S.size());
  if (LLVM_LIKELY(Legal))
    return true;

  // Report where validation stopped, if the caller asked for it.
  if (ErrOffset)
    *ErrOffset = Cursor - Begin;
  return false;
}
|
||||
|
||||
/// Returns a copy of \p S round-tripped through UTF-32: decoded leniently,
/// then re-encoded strictly as UTF-8.
std::string fixUTF8(llvm::StringRef S) {
  // Only used for error recovery, so simplicity is preferred over speed.

  // Pass 1: lenient decode to UTF-32. One output slot per input byte is
  // always enough.
  std::vector<UTF32> Decoded(S.size());
  const UTF8 *Src8 = reinterpret_cast<const UTF8 *>(S.data());
  UTF32 *Dst32 = Decoded.data();
  ConvertUTF8toUTF32(&Src8, Src8 + S.size(), &Dst32, Dst32 + Decoded.size(),
                     lenientConversion);
  // Trim to the number of code points actually written.
  Decoded.resize(Dst32 - Decoded.data());

  // Pass 2: strict re-encode to UTF-8. Four bytes per code point suffice.
  std::string Fixed(4 * Decoded.size(), 0);
  const UTF32 *Src32 = Decoded.data();
  UTF8 *Dst8 = reinterpret_cast<UTF8 *>(&Fixed[0]);
  ConvertUTF32toUTF8(&Src32, Src32 + Decoded.size(), &Dst8,
                     Dst8 + Fixed.size(), strictConversion);
  // Trim to the number of bytes actually written.
  Fixed.resize(reinterpret_cast<char *>(Dst8) - Fixed.data());
  return Fixed;
}
|
||||
|
||||
} // namespace json
|
||||
} // namespace llvm
|
||||
|
||||
|
|
|
@ -27,6 +27,14 @@ TEST(JSONTest, Types) {
|
|||
EXPECT_EQ(R"("foo")", s("foo"));
|
||||
EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
|
||||
EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
|
||||
|
||||
#ifdef NDEBUG
|
||||
EXPECT_EQ(R"("<EFBFBD><EFBFBD>")", s("\xC0\x80"));
|
||||
EXPECT_EQ(R"({"<EFBFBD><EFBFBD>":0})", s(Object{{"\xC0\x80", 0}}));
|
||||
#else
|
||||
EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
|
||||
EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(JSONTest, Constructors) {
|
||||
|
@ -181,6 +189,31 @@ TEST(JSONTest, ParseErrors) {
|
|||
"valid": 1,
|
||||
invalid: 2
|
||||
})");
|
||||
ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // WTF-8 null
|
||||
}
|
||||
|
||||
// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
|
||||
TEST(JSONTest, UTF8) {
|
||||
for (const char *Valid : {
|
||||
"this is ASCII text",
|
||||
"thïs tëxt häs BMP chäräctërs",
|
||||
"𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
|
||||
}) {
|
||||
EXPECT_TRUE(isUTF8(Valid)) << Valid;
|
||||
EXPECT_EQ(fixUTF8(Valid), Valid);
|
||||
}
|
||||
for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
|
||||
{"lone trailing \x81\x82 bytes", "lone trailing <20><> bytes"},
|
||||
{"missing trailing \xD0 bytes", "missing trailing <20> bytes"},
|
||||
{"truncated character \xD0", "truncated character <20>"},
|
||||
{"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
|
||||
"not <20><> the <20><><EFBFBD> shortest <20><><EFBFBD><EFBFBD> encoding"},
|
||||
{"too \xF9\x80\x80\x80\x80 long", "too <20><><EFBFBD><EFBFBD><EFBFBD> long"},
|
||||
{"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
|
||||
"surrogate <20><><EFBFBD> invalid <20><><EFBFBD><EFBFBD>"}}) {
|
||||
EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
|
||||
EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(JSONTest, Inspection) {
|
||||
|
|
Loading…
Reference in New Issue