forked from OSchip/llvm-project
[YAMLIO] Support non-null-terminated inputs
In some places the parser guards against dereferencing `End`, while in others it relies on the presence of a trailing `'\0'` to elide checks. Add the remaining guards needed to ensure the parser never attempts to dereference `End`, making it safe to not require a null-terminated input buffer. Update the parser fuzzer harness so that it tests with buffers that are guaranteed to be non-null-terminated, null-terminated, and 1-terminated, additionally ensuring the result of the parse is the same in each case. Some of the regression tests were written by inspection, and some are cases caught by the fuzzer which required additional fixes in the parser. Differential Revision: https://reviews.llvm.org/D84050
This commit is contained in:
parent
544cb649d7
commit
2980933d85
|
@ -200,13 +200,12 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
|
|||
StringRef::iterator End = Range.end();
|
||||
// 1 byte: [0x00, 0x7f]
|
||||
// Bit pattern: 0xxxxxxx
|
||||
if ((*Position & 0x80) == 0) {
|
||||
return std::make_pair(*Position, 1);
|
||||
if (Position < End && (*Position & 0x80) == 0) {
|
||||
return std::make_pair(*Position, 1);
|
||||
}
|
||||
// 2 bytes: [0x80, 0x7ff]
|
||||
// Bit pattern: 110xxxxx 10xxxxxx
|
||||
if (Position + 1 != End &&
|
||||
((*Position & 0xE0) == 0xC0) &&
|
||||
if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
|
||||
((*(Position + 1) & 0xC0) == 0x80)) {
|
||||
uint32_t codepoint = ((*Position & 0x1F) << 6) |
|
||||
(*(Position + 1) & 0x3F);
|
||||
|
@ -215,8 +214,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
|
|||
}
|
||||
// 3 bytes: [0x8000, 0xffff]
|
||||
// Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
|
||||
if (Position + 2 != End &&
|
||||
((*Position & 0xF0) == 0xE0) &&
|
||||
if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
|
||||
((*(Position + 1) & 0xC0) == 0x80) &&
|
||||
((*(Position + 2) & 0xC0) == 0x80)) {
|
||||
uint32_t codepoint = ((*Position & 0x0F) << 12) |
|
||||
|
@ -230,8 +228,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
|
|||
}
|
||||
// 4 bytes: [0x10000, 0x10FFFF]
|
||||
// Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
if (Position + 3 != End &&
|
||||
((*Position & 0xF8) == 0xF0) &&
|
||||
if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
|
||||
((*(Position + 1) & 0xC0) == 0x80) &&
|
||||
((*(Position + 2) & 0xC0) == 0x80) &&
|
||||
((*(Position + 3) & 0xC0) == 0x80)) {
|
||||
|
@ -773,7 +770,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
|
|||
IsSimpleKeyAllowed = true;
|
||||
Failed = false;
|
||||
std::unique_ptr<MemoryBuffer> InputBufferOwner =
|
||||
MemoryBuffer::getMemBuffer(Buffer);
|
||||
MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
|
||||
SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
|
||||
}
|
||||
|
||||
|
@ -1036,7 +1033,7 @@ bool Scanner::rollIndent( int ToColumn
|
|||
}
|
||||
|
||||
void Scanner::skipComment() {
|
||||
if (*Current != '#')
|
||||
if (Current == End || *Current != '#')
|
||||
return;
|
||||
while (true) {
|
||||
// This may skip more than one byte, thus Column is only incremented
|
||||
|
@ -1051,7 +1048,7 @@ void Scanner::skipComment() {
|
|||
|
||||
void Scanner::scanToNextToken() {
|
||||
while (true) {
|
||||
while (*Current == ' ' || *Current == '\t') {
|
||||
while (Current != End && (*Current == ' ' || *Current == '\t')) {
|
||||
skip(1);
|
||||
}
|
||||
|
||||
|
@ -1286,7 +1283,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
|
|||
&& wasEscaped(Start + 1, Current));
|
||||
} else {
|
||||
skip(1);
|
||||
while (true) {
|
||||
while (Current != End) {
|
||||
// Skip a ' followed by another '.
|
||||
if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
|
||||
skip(2);
|
||||
|
@ -1334,13 +1331,14 @@ bool Scanner::scanPlainScalar() {
|
|||
unsigned LeadingBlanks = 0;
|
||||
assert(Indent >= -1 && "Indent must be >= -1 !");
|
||||
unsigned indent = static_cast<unsigned>(Indent + 1);
|
||||
while (true) {
|
||||
while (Current != End) {
|
||||
if (*Current == '#')
|
||||
break;
|
||||
|
||||
while (!isBlankOrBreak(Current)) {
|
||||
if ( FlowLevel && *Current == ':'
|
||||
&& !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
|
||||
while (Current != End && !isBlankOrBreak(Current)) {
|
||||
if (FlowLevel && *Current == ':' &&
|
||||
(Current + 1 == End ||
|
||||
!(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
|
||||
setError("Found unexpected ':' while scanning a plain scalar", Current);
|
||||
return false;
|
||||
}
|
||||
|
@ -1410,7 +1408,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
|
|||
StringRef::iterator Start = Current;
|
||||
unsigned ColStart = Column;
|
||||
skip(1);
|
||||
while(true) {
|
||||
while (Current != End) {
|
||||
if ( *Current == '[' || *Current == ']'
|
||||
|| *Current == '{' || *Current == '}'
|
||||
|| *Current == ','
|
||||
|
|
|
@ -21,12 +21,32 @@ static bool isValidYaml(const uint8_t *Data, size_t Size) {
|
|||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
|
||||
std::vector<uint8_t> Input(Data, Data + Size);
|
||||
|
||||
// Ensure we don't crash on any arbitrary byte string.
|
||||
isValidYaml(Input.data(), Input.size());
|
||||
|
||||
// Ensure we don't crash on byte strings with no null characters.
|
||||
Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
|
||||
Input.shrink_to_fit();
|
||||
bool IsValidWithout0s = isValidYaml(Input.data(), Input.size());
|
||||
|
||||
// Ensure we don't crash on byte strings where the only null character is
|
||||
// one-past-the-end of the actual input to the parser.
|
||||
Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
|
||||
Input.push_back(0);
|
||||
Input.shrink_to_fit();
|
||||
isValidYaml(Input.data(), Input.size() - 1);
|
||||
bool IsValidWhen0Terminated = isValidYaml(Input.data(), Input.size() - 1);
|
||||
|
||||
// Ensure we don't crash on byte strings with no null characters, but with
|
||||
// an invalid character one-past-the-end of the actual input to the parser.
|
||||
Input.back() = 1;
|
||||
bool IsValidWhen1Terminated = isValidYaml(Input.data(), Input.size() - 1);
|
||||
|
||||
// The parser should either accept all of these inputs, or reject all of
|
||||
// them, because the parser sees an identical byte string in each case. This
|
||||
// should hopefully catch some cases where the parser is sensitive to what is
|
||||
// present one-past-the-end of the actual input.
|
||||
if (IsValidWithout0s != IsValidWhen0Terminated ||
|
||||
IsValidWhen0Terminated != IsValidWhen1Terminated)
|
||||
LLVM_BUILTIN_TRAP;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -3111,5 +3111,95 @@ TEST(YAMLIO, TestEmptyAlias) {
|
|||
TEST(YAMLIO, TestEmptyAnchor) {
|
||||
Input yin("*");
|
||||
EXPECT_FALSE(yin.setCurrentDocument());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullEmpty) {
|
||||
std::vector<char> str{};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_FALSE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullSequenceOfNull) {
|
||||
std::vector<char> str{'-'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_FALSE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullSimpleSequence) {
|
||||
std::vector<char> str{'-', ' ', 'a'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_FALSE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullUnbalancedMap) {
|
||||
std::vector<char> str{'{'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullEmptyMap) {
|
||||
std::vector<char> str{'{', '}'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_FALSE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullUnbalancedSequence) {
|
||||
std::vector<char> str{'['};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullEmptySequence) {
|
||||
std::vector<char> str{'[', ']'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_FALSE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullScalarUnbalancedDoubleQuote) {
|
||||
std::vector<char> str{'"'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullScalarUnbalancedSingleQuote) {
|
||||
std::vector<char> str{'\''};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullEmptyAlias) {
|
||||
std::vector<char> str{'&'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullEmptyAnchor) {
|
||||
std::vector<char> str{'*'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullDecodeInvalidUTF8) {
|
||||
std::vector<char> str{'\xef'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestScannerNoNullScanPlainScalarInFlow) {
|
||||
std::vector<char> str{'{', 'a', ':'};
|
||||
Input yin(llvm::StringRef(str.data(), str.size()));
|
||||
yin.setCurrentDocument();
|
||||
EXPECT_TRUE(yin.error());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue