[Support] Harden JSON against invalid UTF-8.

Parsing invalid UTF-8 input is now a parse error.
Creating JSON values from invalid UTF-8 now triggers an assertion, and
(in no-assert builds) substitutes the unicode replacement character.
Strings retrieved from json::Value are always valid UTF-8.

llvm-svn: 336657
This commit is contained in:
Sam McCall 2018-07-10 11:51:26 +00:00
parent ce5c19b623
commit e6057bc689
4 changed files with 146 additions and 17 deletions

View File

@ -88,6 +88,17 @@ inline bool isAlpha(char C) {
/// lowercase letter as classified by "C" locale.
inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
/// Returns true if \p C is a 7-bit ASCII character, i.e. its top bit is clear.
inline bool isASCII(char C) {
  // Go through unsigned char so the test is independent of char's signedness.
  return (static_cast<unsigned char>(C) & 0x80u) == 0;
}
/// Returns true if every character of \p S is ASCII (vacuously true for an
/// empty string); stops at the first non-ASCII character.
inline bool isASCII(llvm::StringRef S) {
  for (const char *I = S.begin(), *E = S.end(); I != E; ++I) {
    if (LLVM_UNLIKELY(!isASCII(*I)))
      return false;
  }
  return true;
}
/// Returns the corresponding lowercase character if \p x is uppercase.
inline char toLower(char x) {
if (x >= 'A' && x <= 'Z')

View File

@ -54,6 +54,30 @@
namespace llvm {
namespace json {
// === String encodings ===
//
// JSON strings are character sequences (not byte sequences like std::string).
// We need to know the encoding, and for simplicity only support UTF-8.
//
// - When parsing, invalid UTF-8 is a syntax error like any other
//
// - When creating Values from strings, callers must ensure they are UTF-8.
// with asserts on, invalid UTF-8 will crash the program
// with asserts off, we'll substitute the replacement character (U+FFFD)
// Callers can use json::isUTF8() and json::fixUTF8() for validation.
//
// - When retrieving strings from Values (e.g. asString()), the result will
// always be valid UTF-8.
/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
/// If it returns false and \p ErrOffset is non-null, \p *ErrOffset is set to a
/// byte offset near the first error. \p *ErrOffset is not written on success.
bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
/// Replaces invalid UTF-8 sequences in \p S with the replacement character
/// (U+FFFD). The returned string is valid UTF-8.
/// \p S itself is not modified; the result is a newly built string.
/// This is much slower than isUTF8, so test that first.
std::string fixUTF8(llvm::StringRef S);
class Array;
class ObjectKey;
class Value;
@ -273,16 +297,26 @@ public:
// Objects: takes the properties by rvalue reference and moves them into
// create<json::Object>().
Value(json::Object &&Properties) : Type(T_Object) {
create<json::Object>(std::move(Properties));
}
// Strings: types with value semantics.
Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
create<std::string>(V.begin(), V.end());
// Strings: types with value semantics. Must be valid UTF-8.
Value(std::string V) : Type(T_String) {
if (LLVM_UNLIKELY(!isUTF8(V))) {
assert(false && "Invalid UTF-8 in value used as JSON");
V = fixUTF8(std::move(V));
}
create<std::string>(std::move(V));
}
Value(const llvm::SmallVectorImpl<char> &V)
: Value(std::string(V.begin(), V.end())){};
Value(const llvm::formatv_object_base &V) : Value(V.str()){};
// Strings: types with reference semantics. Must be valid UTF-8.
//
// The reference is stored first. If validation fails (asserts off), the whole
// value is then replaced by one built from the std::string that fixUTF8()
// returns, i.e. a value holding a repaired copy rather than a reference to the
// caller's invalid data.
Value(llvm::StringRef V) : Type(T_StringRef) {
  create<llvm::StringRef>(V);
  if (LLVM_UNLIKELY(!isUTF8(V))) {
    assert(false && "Invalid UTF-8 in value used as JSON");
    *this = Value(fixUTF8(V));
  }
}
// C strings are validated by delegating to the StringRef constructor.
Value(const char *V) : Value(llvm::StringRef(V)) {}
Value(std::nullptr_t) : Type(T_Null) {}
// Boolean (disallow implicit conversions).
// (The last template parameter is a dummy to keep templates distinct.)
@ -449,13 +483,23 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Value &);
/// ObjectKey is used to capture keys in Object. Like Value but:
/// - only strings are allowed
/// - it's optimized for the string literal case (Owned == nullptr)
/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
class ObjectKey {
public:
ObjectKey(const char *S) : Data(S) {}
ObjectKey(llvm::StringRef S) : Data(S) {}
ObjectKey(std::string &&V)
: Owned(new std::string(std::move(V))), Data(*Owned) {}
ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
assert(false && "Invalid UTF-8 in value used as JSON");
*Owned = fixUTF8(std::move(*Owned));
}
Data = *Owned;
}
ObjectKey(llvm::StringRef S) : Data(S) {
if (LLVM_UNLIKELY(!isUTF8(Data))) {
assert(false && "Invalid UTF-8 in value used as JSON");
*this = ObjectKey(fixUTF8(S));
}
}
ObjectKey(const llvm::SmallVectorImpl<char> &V)
: ObjectKey(std::string(V.begin(), V.end())) {}
ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}

View File

@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
@ -199,6 +200,14 @@ public:
// Sets up a parser over \p JSON: Start and P both begin at the first byte and
// End is one past the last byte.
Parser(StringRef JSON)
: Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
// Validates the whole input [Start, End) as UTF-8.
// Returns true when the input is valid. Otherwise moves P to the reported
// error offset, so that the error's line/column is calculated from there, and
// returns the result of parseError().
bool checkUTF8() {
  size_t BadOffset;
  const bool Valid = isUTF8(StringRef(Start, End - Start), &BadOffset);
  if (!Valid) {
    P = Start + BadOffset;
    return parseError("Invalid UTF-8 sequence");
  }
  return true;
}
bool parseValue(Value &Out);
bool assertEnd() {
@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) {
// Case 3: it's a leading surrogate. We expect a trailing one next.
// Case 3a: there's no trailing \u escape. Don't advance in the stream.
if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
Invalid(); // Leading surrogate was unpaired.
return true;
}
@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) {
/// Parses \p JSON as a single JSON value.
///
/// The input is validated as UTF-8 before any parsing is attempted, so invalid
/// UTF-8 is reported as a parse error. The value is returned only if
/// checkUTF8(), parseValue() and assertEnd() all succeed; otherwise the error
/// recorded by the parser is returned.
Expected<Value> parse(StringRef JSON) {
  Parser P(JSON);
  Value E = nullptr;
  // Validation must come first: parsing must never run on (and return a value
  // built from) input that has not been checked.
  if (P.checkUTF8())
    if (P.parseValue(E))
      if (P.assertEnd())
        return std::move(E);
  return P.takeError();
}
char ParseError::ID = 0;
@ -514,6 +524,37 @@ static std::vector<const Object::value_type *> sortedElements(const Object &O) {
return Elements;
}
// Returns true if S is valid UTF-8. On failure, writes to *ErrOffset (when
// ErrOffset is non-null) how far isLegalUTF8String advanced through S.
bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
  // ASCII-only input is always valid UTF-8, so it can skip the full check.
  if (LLVM_LIKELY(isASCII(S)))
    return true;

  const UTF8 *const Begin = reinterpret_cast<const UTF8 *>(S.data());
  // In/out position: isLegalUTF8String updates it as it scans.
  const UTF8 *Cursor = Begin;
  const bool Legal = isLegalUTF8String(&Cursor, Begin + S.size());
  if (LLVM_UNLIKELY(!Legal)) {
    if (ErrOffset)
      *ErrOffset = Cursor - Begin;
    return false;
  }
  return true;
}
// Returns a copy of S in which invalid UTF-8 has been replaced: S is decoded
// leniently to UTF-32 and then re-encoded strictly as UTF-8.
std::string fixUTF8(llvm::StringRef S) {
  // Only used for error recovery, so simplicity is preferred over speed.
  const size_t NumBytes = S.size();

  // Decode. One code point per input byte is always enough room.
  std::vector<UTF32> Decoded(NumBytes);
  const UTF8 *Src8 = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *const Src8End = Src8 + NumBytes;
  UTF32 *Dst32 = Decoded.data();
  UTF32 *const Dst32End = Dst32 + Decoded.size();
  ConvertUTF8toUTF32(&Src8, Src8End, &Dst32, Dst32End, lenientConversion);
  // Dst32 was advanced past the last code point written; trim to that.
  Decoded.resize(Dst32 - Decoded.data());

  // Re-encode. Four bytes per code point is always enough room.
  std::string Fixed(4 * Decoded.size(), 0);
  const UTF32 *Src32 = Decoded.data();
  const UTF32 *const Src32End = Src32 + Decoded.size();
  UTF8 *Dst8 = reinterpret_cast<UTF8 *>(&Fixed[0]);
  UTF8 *const Dst8End = Dst8 + Fixed.size();
  ConvertUTF32toUTF8(&Src32, Src32End, &Dst8, Dst8End, strictConversion);
  // Dst8 was advanced past the last byte written; trim to that.
  Fixed.resize(reinterpret_cast<char *>(Dst8) - Fixed.data());
  return Fixed;
}
} // namespace json
} // namespace llvm

View File

@ -27,6 +27,14 @@ TEST(JSONTest, Types) {
EXPECT_EQ(R"("foo")", s("foo"));
EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
#ifdef NDEBUG
// Without asserts, the invalid bytes C0 80 come out as two replacement
// characters. U+FFFD is spelled as its UTF-8 bytes (EF BF BD) so the
// expectation survives tools that mangle non-ASCII source text.
EXPECT_EQ("\"\xEF\xBF\xBD\xEF\xBF\xBD\"", s("\xC0\x80"));
EXPECT_EQ("{\"\xEF\xBF\xBD\xEF\xBF\xBD\":0}", s(Object{{"\xC0\x80", 0}}));
#else
// With asserts, constructing a Value or key from invalid UTF-8 must die.
EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
#endif
}
TEST(JSONTest, Constructors) {
@ -181,6 +189,31 @@ TEST(JSONTest, ParseErrors) {
"valid": 1,
invalid: 2
})");
ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Overlong NUL (Modified UTF-8)
}
// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
TEST(JSONTest, UTF8) {
  // Valid input must be accepted by isUTF8 and left unchanged by fixUTF8.
  for (const char *Valid : {
           "this is ASCII text",
           "thïs tëxt häs BMP chäräctërs",
           "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
       }) {
    EXPECT_TRUE(isUTF8(Valid)) << Valid;
    EXPECT_EQ(fixUTF8(Valid), Valid);
  }
  // Each pair is {invalid input, expected fixUTF8 output}. The expected output
  // has one U+FFFD per invalid input byte; U+FFFD is spelled as its UTF-8
  // bytes (EF BF BD) so the expectations survive tools that mangle non-ASCII
  // source text.
  for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
           {"lone trailing \x81\x82 bytes",
            "lone trailing \xEF\xBF\xBD\xEF\xBF\xBD bytes"},
           {"missing trailing \xD0 bytes", "missing trailing \xEF\xBF\xBD bytes"},
           {"truncated character \xD0", "truncated character \xEF\xBF\xBD"},
           {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
            "not \xEF\xBF\xBD\xEF\xBF\xBD the \xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD"
            " shortest \xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD encoding"},
           {"too \xF9\x80\x80\x80\x80 long",
            "too \xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD long"},
           {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
            "surrogate \xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD invalid "
            "\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD"}}) {
    EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
    EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
  }
}
TEST(JSONTest, Inspection) {