llvm-project/llvm/lib/Support/YAMLParser.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

2471 lines
69 KiB
C++
Raw Normal View History

//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/AllocatorList.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <system_error>
#include <utility>
using namespace llvm;
using namespace yaml;
/// UnicodeEncodingForm - The Unicode encoding forms that getUnicodeEncoding
/// can report for an input buffer.
enum UnicodeEncodingForm {
UEF_UTF32_LE, ///< UTF-32 Little Endian
UEF_UTF32_BE, ///< UTF-32 Big Endian
UEF_UTF16_LE, ///< UTF-16 Little Endian
UEF_UTF16_BE, ///< UTF-16 Big Endian
UEF_UTF8, ///< UTF-8 or ascii.
UEF_Unknown ///< Not a valid Unicode encoding.
};
/// EncodingInfo - Pairs the detected encoding form with the length in bytes
/// of the byte order mark, if one is present. Length is in {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;

/// getUnicodeEncoding - Inspects at most the first 4 bytes of \a Input to
/// guess its Unicode encoding form.
///
/// A byte order mark is recognized first; without one, the position of zero
/// bytes among the leading bytes is used to tell UTF-32 and UTF-16 apart from
/// UTF-8.
///
/// @param Input A string of length 0 or more.
/// @returns An EncodingInfo holding the encoding form and the length of the
///          byte order mark (0 when there is none). Empty input yields
///          UEF_Unknown.
static EncodingInfo getUnicodeEncoding(StringRef Input) {
  const size_t Size = Input.size();
  if (Size == 0)
    return EncodingInfo(UEF_Unknown, 0);

  // Every access below is guarded by a check against Size.
  auto ByteAt = [&Input](size_t Idx) { return uint8_t(Input[Idx]); };
  const uint8_t Lead = ByteAt(0);

  if (Lead == 0x00) {
    if (Size >= 4 && ByteAt(1) == 0) {
      // 00 00 FE FF is the UTF-32 big endian byte order mark.
      if (ByteAt(2) == 0xFE && ByteAt(3) == 0xFF)
        return EncodingInfo(UEF_UTF32_BE, 4);
      // 00 00 00 xx: a UTF-32 big endian code unit without a mark.
      if (ByteAt(2) == 0 && ByteAt(3) != 0)
        return EncodingInfo(UEF_UTF32_BE, 0);
    }
    // 00 xx: a UTF-16 big endian code unit without a mark.
    if (Size >= 2 && ByteAt(1) != 0)
      return EncodingInfo(UEF_UTF16_BE, 0);
    return EncodingInfo(UEF_Unknown, 0);
  }

  if (Lead == 0xFF) {
    const bool HasFE = Size >= 2 && ByteAt(1) == 0xFE;
    // FF FE 00 00 must be tested before the shorter UTF-16 mark FF FE.
    if (Size >= 4 && HasFE && ByteAt(2) == 0 && ByteAt(3) == 0)
      return EncodingInfo(UEF_UTF32_LE, 4);
    if (HasFE)
      return EncodingInfo(UEF_UTF16_LE, 2);
    return EncodingInfo(UEF_Unknown, 0);
  }

  if (Lead == 0xFE) {
    if (Size >= 2 && ByteAt(1) == 0xFF)
      return EncodingInfo(UEF_UTF16_BE, 2);
    return EncodingInfo(UEF_Unknown, 0);
  }

  if (Lead == 0xEF) {
    if (Size >= 3 && ByteAt(1) == 0xBB && ByteAt(2) == 0xBF)
      return EncodingInfo(UEF_UTF8, 3);
    return EncodingInfo(UEF_Unknown, 0);
  }

  // No byte order mark, but it could still be little endian utf-32 or utf-16.
  if (Size >= 4 && ByteAt(1) == 0 && ByteAt(2) == 0 && ByteAt(3) == 0)
    return EncodingInfo(UEF_UTF32_LE, 0);
  if (Size >= 2 && ByteAt(1) == 0)
    return EncodingInfo(UEF_UTF16_LE, 0);
  return EncodingInfo(UEF_UTF8, 0);
}
/// Pin the vtables to this file.
///
/// Each node class gets an empty, out-of-line anchor() definition here.
void Node::anchor() {}
void NullNode::anchor() {}
void ScalarNode::anchor() {}
void BlockScalarNode::anchor() {}
void KeyValueNode::anchor() {}
void MappingNode::anchor() {}
void SequenceNode::anchor() {}
void AliasNode::anchor() {}
namespace llvm {
namespace yaml {

/// Token - A single YAML token.
struct Token {
  /// The kinds of token that exist. TK_Error doubles as the kind of a token
  /// that has not been initialized.
  enum TokenKind {
    TK_Error, // Uninitialized token.
    TK_StreamStart,
    TK_StreamEnd,
    TK_VersionDirective,
    TK_TagDirective,
    TK_DocumentStart,
    TK_DocumentEnd,
    TK_BlockEntry,
    TK_BlockEnd,
    TK_BlockSequenceStart,
    TK_BlockMappingStart,
    TK_FlowEntry,
    TK_FlowSequenceStart,
    TK_FlowSequenceEnd,
    TK_FlowMappingStart,
    TK_FlowMappingEnd,
    TK_Key,
    TK_Value,
    TK_Scalar,
    TK_BlockScalar,
    TK_Alias,
    TK_Anchor,
    TK_Tag
  };

  /// The kind of this token; TK_Error until set otherwise.
  TokenKind Kind = TK_Error;

  /// A string of length 0 or more whose begin() points to the logical location
  /// of the token in the input.
  StringRef Range;

  /// The value of a block scalar node.
  std::string Value;

  Token() = default;
};

} // end namespace yaml
} // end namespace llvm
/// The queue type used to hold scanned tokens.
using TokenQueueT = BumpPtrList<Token>;

namespace {

/// This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
/// which could legally be the start of a simple key. When peekNext is called,
/// if the Token to be returned is referenced by a SimpleKey, we continue
/// tokenizing until that potential simple key has either:
///   - been found to not be a simple key (we moved on to the next line or
///     went further than 1024 chars), or
///   - been confirmed by running into a Value, at which point a Key token
///     (and possibly others) is inserted before the SimpleKey's Tok.
struct SimpleKey {
  /// The token that may turn out to be the start of a simple key.
  TokenQueueT::iterator Tok;
  unsigned Column = 0;
  unsigned Line = 0;
  unsigned FlowLevel = 0;
  bool IsRequired = false;

  /// Two candidates are equal when they refer to the same token; the other
  /// fields are not compared. Const-qualified so that it can be called on
  /// const objects and stays symmetric in its two operands.
  bool operator==(const SimpleKey &Other) const {
    return Tok == Other.Tok;
  }
};

} // end anonymous namespace
/// The Unicode scalar value of a UTF-8 minimal well-formed code unit
/// subsequence and the subsequence's length in code units (uint8_t).
/// A length of 0 represents an error.
using UTF8Decoded = std::pair<uint32_t, unsigned>;

/// Decode the UTF-8 code unit subsequence at the start of \a Range.
///
/// Only minimal (shortest-form) encodings are accepted, and the UTF-16
/// surrogate range [0xD800, 0xDFFF] is rejected. No byte outside of \a Range
/// is read: a sequence cut short by the end of the range is an error.
///
/// @param Range The bytes to decode from; may be empty.
/// @returns The decoded scalar value and its length in bytes, or {0, 0} if
///          \a Range is empty or does not start with a well-formed sequence.
static UTF8Decoded decodeUTF8(StringRef Range) {
  StringRef::iterator Position = Range.begin();
  // Number of bytes that may be read starting at Position.
  const size_t Available = Range.size();
  if (Available == 0)
    return std::make_pair(0, 0);
  // 1 byte: [0x00, 0x7f]
  // Bit pattern: 0xxxxxxx
  if ((*Position & 0x80) == 0) {
    return std::make_pair(*Position, 1);
  }
  // 2 bytes: [0x80, 0x7ff]
  // Bit pattern: 110xxxxx 10xxxxxx
  if (Available >= 2 &&
      ((*Position & 0xE0) == 0xC0) &&
      ((*(Position + 1) & 0xC0) == 0x80)) {
    uint32_t codepoint = ((*Position & 0x1F) << 6) |
                         (*(Position + 1) & 0x3F);
    // Anything below 0x80 is an overlong encoding.
    if (codepoint >= 0x80)
      return std::make_pair(codepoint, 2);
  }
  // 3 bytes: [0x800, 0xffff]
  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
  if (Available >= 3 &&
      ((*Position & 0xF0) == 0xE0) &&
      ((*(Position + 1) & 0xC0) == 0x80) &&
      ((*(Position + 2) & 0xC0) == 0x80)) {
    uint32_t codepoint = ((*Position & 0x0F) << 12) |
                         ((*(Position + 1) & 0x3F) << 6) |
                          (*(Position + 2) & 0x3F);
    // Codepoints between 0xD800 and 0xDFFF are invalid, as
    // they are high / low surrogate halves used by UTF-16.
    if (codepoint >= 0x800 &&
        (codepoint < 0xD800 || codepoint > 0xDFFF))
      return std::make_pair(codepoint, 3);
  }
  // 4 bytes: [0x10000, 0x10FFFF]
  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if (Available >= 4 &&
      ((*Position & 0xF8) == 0xF0) &&
      ((*(Position + 1) & 0xC0) == 0x80) &&
      ((*(Position + 2) & 0xC0) == 0x80) &&
      ((*(Position + 3) & 0xC0) == 0x80)) {
    uint32_t codepoint = ((*Position & 0x07) << 18) |
                         ((*(Position + 1) & 0x3F) << 12) |
                         ((*(Position + 2) & 0x3F) << 6) |
                          (*(Position + 3) & 0x3F);
    if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
      return std::make_pair(codepoint, 4);
  }
  return std::make_pair(0, 0);
}
namespace llvm {
namespace yaml {
/// Scans YAML tokens from a MemoryBuffer.
class Scanner {
public:
Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
std::error_code *EC = nullptr);
Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
std::error_code *EC = nullptr);
/// Parse the next token and return it without popping it.
Token &peekNext();
/// Parse the next token and pop it from the queue.
Token getNext();
void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
ArrayRef<SMRange> Ranges = None) {
SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
}
void setError(const Twine &Message, StringRef::iterator Position) {
if (Current >= End)
Current = End - 1;
// propagate the error if possible
if (EC)
*EC = make_error_code(std::errc::invalid_argument);
// Don't print out more errors after the first one we encounter. The rest
// are just the result of the first, and have no meaning.
if (!Failed)
printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
Failed = true;
}
void setError(const Twine &Message) {
setError(Message, Current);
}
/// Returns true if an error occurred while parsing.
bool failed() {
return Failed;
}
private:
void init(MemoryBufferRef Buffer);
StringRef currentInput() {
return StringRef(Current, End - Current);
}
/// Decode a UTF-8 minimal well-formed code unit subsequence starting
/// at \a Position.
///
/// If the UTF-8 code units starting at Position do not form a well-formed
/// code unit subsequence, then the Unicode scalar value is 0, and the length
/// is 0.
UTF8Decoded decodeUTF8(StringRef::iterator Position) {
return ::decodeUTF8(StringRef(Position, End - Position));
}
// The following functions are based on the gramar rules in the YAML spec. The
// style of the function names it meant to closely match how they are written
// in the spec. The number within the [] is the number of the grammar rule in
// the spec.
//
// See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
//
// c-
// A production starting and ending with a special character.
// b-
// A production matching a single line break.
// nb-
// A production starting and ending with a non-break character.
// s-
// A production starting and ending with a white space character.
// ns-
// A production starting and ending with a non-space character.
// l-
// A production matching complete line(s).
/// Skip a single nb-char[27] starting at Position.
///
/// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
/// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
///
/// @returns The code unit after the nb-char, or Position if it's not an
/// nb-char.
StringRef::iterator skip_nb_char(StringRef::iterator Position);
/// Skip a single b-break[28] starting at Position.
///
/// A b-break is 0xD 0xA | 0xD | 0xA
///
/// @returns The code unit after the b-break, or Position if it's not a
/// b-break.
StringRef::iterator skip_b_break(StringRef::iterator Position);
/// Skip a single s-space[31] starting at Position.
///
/// An s-space is 0x20
///
/// @returns The code unit after the s-space, or Position if it's not a
/// s-space.
StringRef::iterator skip_s_space(StringRef::iterator Position);
/// Skip a single s-white[33] starting at Position.
///
/// A s-white is 0x20 | 0x9
///
/// @returns The code unit after the s-white, or Position if it's not a
/// s-white.
StringRef::iterator skip_s_white(StringRef::iterator Position);
/// Skip a single ns-char[34] starting at Position.
///
/// A ns-char is nb-char - s-white
///
/// @returns The code unit after the ns-char, or Position if it's not a
/// ns-char.
StringRef::iterator skip_ns_char(StringRef::iterator Position);
using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
/// Skip minimal well-formed code unit subsequences until Func
/// returns its input.
///
/// @returns The code unit after the last minimal well-formed code unit
/// subsequence that Func accepted.
StringRef::iterator skip_while( SkipWhileFunc Func
, StringRef::iterator Position);
/// Skip minimal well-formed code unit subsequences until Func returns its
/// input.
void advanceWhile(SkipWhileFunc Func);
/// Scan ns-uri-char[39]s starting at Cur.
///
/// This updates Cur and Column while scanning.
void scan_ns_uri_char();
/// Consume a minimal well-formed code unit subsequence starting at
/// \a Cur. Return false if it is not the same Unicode scalar value as
/// \a Expected. This updates \a Column.
bool consume(uint32_t Expected);
/// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
void skip(uint32_t Distance);
/// Return true if the minimal well-formed code unit subsequence at
/// Pos is whitespace or a new line
bool isBlankOrBreak(StringRef::iterator Position);
/// Consume a single b-break[28] if it's present at the current position.
///
/// Return false if the code unit at the current position isn't a line break.
bool consumeLineBreakIfPresent();
/// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
, unsigned AtColumn
, bool IsRequired);
/// Remove simple keys that can no longer be valid simple keys.
///
/// Invalid simple keys are not on the current line or are further than 1024
/// columns back.
void removeStaleSimpleKeyCandidates();
/// Remove all simple keys on FlowLevel \a Level.
void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
/// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
/// tokens if needed.
bool unrollIndent(int ToColumn);
/// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
/// if needed.
bool rollIndent( int ToColumn
, Token::TokenKind Kind
, TokenQueueT::iterator InsertPoint);
/// Skip a single-line comment when the comment starts at the current
/// position of the scanner.
void skipComment();
/// Skip whitespace and comments until the start of the next token.
void scanToNextToken();
/// Must be the first token generated.
bool scanStreamStart();
/// Generate tokens needed to close out the stream.
bool scanStreamEnd();
/// Scan a %BLAH directive.
bool scanDirective();
/// Scan a ... or ---.
bool scanDocumentIndicator(bool IsStart);
/// Scan a [ or { and generate the proper flow collection start token.
bool scanFlowCollectionStart(bool IsSequence);
/// Scan a ] or } and generate the proper flow collection end token.
bool scanFlowCollectionEnd(bool IsSequence);
/// Scan the , that separates entries in a flow collection.
bool scanFlowEntry();
/// Scan the - that starts block sequence entries.
bool scanBlockEntry();
/// Scan an explicit ? indicating a key.
bool scanKey();
/// Scan an explicit : indicating a value.
bool scanValue();
/// Scan a quoted scalar.
bool scanFlowScalar(bool IsDoubleQuoted);
/// Scan an unquoted scalar.
bool scanPlainScalar();
/// Scan an Alias or Anchor starting with * or &.
bool scanAliasOrAnchor(bool IsAlias);
/// Scan a block scalar starting with | or >.
bool scanBlockScalar(bool IsLiteral);
/// Scan a chomping indicator in a block scalar header.
char scanBlockChompingIndicator();
/// Scan an indentation indicator in a block scalar header.
unsigned scanBlockIndentationIndicator();
/// Scan a block scalar header.
///
/// Return false if an error occurred.
bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
bool &IsDone);
/// Look for the indentation level of a block scalar.
///
/// Return false if an error occurred.
bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
unsigned &LineBreaks, bool &IsDone);
/// Scan the indentation of a text line in a block scalar.
///
/// Return false if an error occurred.
bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
bool &IsDone);
/// Scan a tag of the form !stuff.
bool scanTag();
/// Dispatch to the next scanning function based on \a *Cur.
bool fetchMoreTokens();
/// The SourceMgr used for diagnostics and buffer management.
SourceMgr &SM;
/// The original input.
MemoryBufferRef InputBuffer;
/// The current position of the scanner.
StringRef::iterator Current;
/// The end of the input (one past the last character).
StringRef::iterator End;
/// Current YAML indentation level in spaces.
int Indent;
/// Current column number in Unicode code points.
unsigned Column;
/// Current line number.
unsigned Line;
/// How deep we are in flow style containers. 0 Means at block level.
unsigned FlowLevel;
/// Are we at the start of the stream?
bool IsStartOfStream;
/// Can the next token be the start of a simple key?
bool IsSimpleKeyAllowed;
/// True if an error has occurred.
bool Failed;
/// Should colors be used when printing out the diagnostic messages?
bool ShowColors;
/// Queue of tokens. This is required to queue up tokens while looking
/// for the end of a simple key. And for cases where a single character
/// can produce multiple tokens (e.g. BlockEnd).
TokenQueueT TokenQueue;
/// Indentation levels.
SmallVector<int, 4> Indents;
/// Potential simple keys.
SmallVector<SimpleKey, 4> SimpleKeys;
std::error_code *EC;
};
} // end namespace yaml
} // end namespace llvm
/// encodeUTF8 - Append the UTF-8 encoding of \a UnicodeScalarValue to
/// \a Result.
///
/// One to four bytes are appended depending on the magnitude of the value.
/// Values above 0x10FFFF append nothing.
static void encodeUTF8( uint32_t UnicodeScalarValue
                      , SmallVectorImpl<char> &Result) {
  const uint32_t V = UnicodeScalarValue;
  // A continuation byte carrying the six bits of V that start at bit Shift.
  auto Continuation = [V](unsigned Shift) -> uint8_t {
    return 0x80 | ((V >> Shift) & 0x3F);
  };

  if (V <= 0x7F) {
    // 0xxxxxxx
    Result.push_back(V & 0x7F);
    return;
  }
  if (V <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    const uint8_t Lead = 0xC0 | ((V >> 6) & 0x1F);
    Result.push_back(Lead);
    Result.push_back(Continuation(0));
    return;
  }
  if (V <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    const uint8_t Lead = 0xE0 | ((V >> 12) & 0x0F);
    Result.push_back(Lead);
    Result.push_back(Continuation(6));
    Result.push_back(Continuation(0));
    return;
  }
  if (V <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    const uint8_t Lead = 0xF0 | ((V >> 18) & 0x07);
    Result.push_back(Lead);
    Result.push_back(Continuation(12));
    Result.push_back(Continuation(6));
    Result.push_back(Continuation(0));
  }
}
bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
SourceMgr SM;
Scanner scanner(Input, SM);
while (true) {
Token T = scanner.getNext();
switch (T.Kind) {
case Token::TK_StreamStart:
OS << "Stream-Start: ";
break;
case Token::TK_StreamEnd:
OS << "Stream-End: ";
break;
case Token::TK_VersionDirective:
OS << "Version-Directive: ";
break;
case Token::TK_TagDirective:
OS << "Tag-Directive: ";
break;
case Token::TK_DocumentStart:
OS << "Document-Start: ";
break;
case Token::TK_DocumentEnd:
OS << "Document-End: ";
break;
case Token::TK_BlockEntry:
OS << "Block-Entry: ";
break;
case Token::TK_BlockEnd:
OS << "Block-End: ";
break;
case Token::TK_BlockSequenceStart:
OS << "Block-Sequence-Start: ";
break;
case Token::TK_BlockMappingStart:
OS << "Block-Mapping-Start: ";
break;
case Token::TK_FlowEntry:
OS << "Flow-Entry: ";
break;
case Token::TK_FlowSequenceStart:
OS << "Flow-Sequence-Start: ";
break;
case Token::TK_FlowSequenceEnd:
OS << "Flow-Sequence-End: ";
break;
case Token::TK_FlowMappingStart:
OS << "Flow-Mapping-Start: ";
break;
case Token::TK_FlowMappingEnd:
OS << "Flow-Mapping-End: ";
break;
case Token::TK_Key:
OS << "Key: ";
break;
case Token::TK_Value:
OS << "Value: ";
break;
case Token::TK_Scalar:
OS << "Scalar: ";
break;
case Token::TK_BlockScalar:
OS << "Block Scalar: ";
break;
case Token::TK_Alias:
OS << "Alias: ";
break;
case Token::TK_Anchor:
OS << "Anchor: ";
break;
case Token::TK_Tag:
OS << "Tag: ";
break;
case Token::TK_Error:
break;
}
OS << T.Range << "\n";
if (T.Kind == Token::TK_StreamEnd)
break;
else if (T.Kind == Token::TK_Error)
return false;
}
return true;
}
/// Scan all of \a Input, discarding the tokens.
///
/// @returns true if the stream end token was reached, false if an error
///          token was produced first.
bool yaml::scanTokens(StringRef Input) {
  SourceMgr SM;
  Scanner scanner(Input, SM);
  for (;;) {
    const Token Tok = scanner.getNext();
    if (Tok.Kind == Token::TK_StreamEnd)
      return true;
    if (Tok.Kind == Token::TK_Error)
      return false;
  }
}
/// Escape \a Input using backslash escape sequences.
///
/// Backslash, double quote, NUL and the control characters with a short
/// escape get that escape; other bytes below 0x20 become \xXX. Multi-byte
/// UTF-8 sequences are decoded: U+0085, U+00A0, U+2028 and U+2029 get their
/// named escapes, printable characters are copied through unless
/// \a EscapePrintable is set, and the rest become \x, \u or \U escapes.
///
/// On the first invalid UTF-8 sequence U+FFFD is appended and the result
/// built so far is returned.
std::string yaml::escape(StringRef Input, bool EscapePrintable) {
  std::string Out;

  // Appends Prefix followed by Hex, left-padded with '0' to Width digits.
  auto AppendHex = [&Out](const char *Prefix, size_t Width,
                          const std::string &Hex) {
    Out += Prefix + std::string(Width - Hex.size(), '0') + Hex;
  };

  const StringRef::iterator Stop = Input.end();
  for (StringRef::iterator Cur = Input.begin(); Cur != Stop; ++Cur) {
    switch (*Cur) {
    case '\\': Out += "\\\\"; break;
    case '"':  Out += "\\\""; break;
    case 0:    Out += "\\0";  break;
    case 0x07: Out += "\\a";  break;
    case 0x08: Out += "\\b";  break;
    case 0x09: Out += "\\t";  break;
    case 0x0A: Out += "\\n";  break;
    case 0x0B: Out += "\\v";  break;
    case 0x0C: Out += "\\f";  break;
    case 0x0D: Out += "\\r";  break;
    case 0x1B: Out += "\\e";  break;
    default:
      if ((unsigned char)*Cur < 0x20) {
        // Control characters not handled above.
        AppendHex("\\x", 2, utohexstr(*Cur));
        break;
      }
      if (!(*Cur & 0x80)) {
        // Plain 7-bit character.
        Out.push_back(*Cur);
        break;
      }

      // UTF-8 multiple code unit subsequence.
      const UTF8Decoded Decoded = decodeUTF8(StringRef(Cur, Input.end() - Cur));
      const uint32_t Scalar = Decoded.first;
      const unsigned Length = Decoded.second;
      if (Length == 0) {
        // Found invalid char.
        SmallString<4> Val;
        encodeUTF8(0xFFFD, Val);
        Out.insert(Out.end(), Val.begin(), Val.end());
        // FIXME: Error reporting.
        return Out;
      }

      if (Scalar == 0x85) {
        Out += "\\N";
      } else if (Scalar == 0xA0) {
        Out += "\\_";
      } else if (Scalar == 0x2028) {
        Out += "\\L";
      } else if (Scalar == 0x2029) {
        Out += "\\P";
      } else if (!EscapePrintable && sys::unicode::isPrintable(Scalar)) {
        Out += StringRef(Cur, Length);
      } else {
        // Pick the shortest escape form that fits all the hex digits.
        const std::string Hex = utohexstr(Scalar);
        if (Hex.size() <= 2)
          AppendHex("\\x", 2, Hex);
        else if (Hex.size() <= 4)
          AppendHex("\\u", 4, Hex);
        else if (Hex.size() <= 8)
          AppendHex("\\U", 8, Hex);
      }

      // The loop increment steps over the last byte of the sequence.
      Cur += Length - 1;
      break;
    }
  }
  return Out;
}
Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
std::error_code *EC)
: SM(sm), ShowColors(ShowColors), EC(EC) {
init(MemoryBufferRef(Input, "YAML"));
}
Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
std::error_code *EC)
: SM(SM_), ShowColors(ShowColors), EC(EC) {
init(Buffer);
}
/// Point the scanner at the start of \a Buffer, reset the scanning state, and
/// add a MemoryBuffer made from \a Buffer to the SourceMgr.
void Scanner::init(MemoryBufferRef Buffer) {
  // Input extent.
  InputBuffer = Buffer;
  Current = InputBuffer.getBufferStart();
  End = InputBuffer.getBufferEnd();

  // Position tracking; -1 means no indentation level has been entered yet.
  Indent = -1;
  Line = 0;
  Column = 0;
  FlowLevel = 0;

  // Scanner flags.
  Failed = false;
  IsStartOfStream = true;
  IsSimpleKeyAllowed = true;

  SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(Buffer), SMLoc());
}
Token &Scanner::peekNext() {
// If the current token is a possible simple key, keep parsing until we
// can confirm.
bool NeedMore = false;
while (true) {
if (TokenQueue.empty() || NeedMore) {
if (!fetchMoreTokens()) {
TokenQueue.clear();
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
SimpleKeys.clear();
TokenQueue.push_back(Token());
return TokenQueue.front();
}
}
assert(!TokenQueue.empty() &&
"fetchMoreTokens lied about getting tokens!");
removeStaleSimpleKeyCandidates();
SimpleKey SK;
SK.Tok = TokenQueue.begin();
if (!is_contained(SimpleKeys, SK))
break;
else
NeedMore = true;
}
return TokenQueue.front();
}
/// Parse the next token, remove it from the queue and return a copy of it.
Token Scanner::getNext() {
  Token Result = peekNext();

  // TokenQueue can be empty if there was an error getting the next token.
  const bool HadToken = !TokenQueue.empty();
  if (HadToken)
    TokenQueue.pop_front();

  // Once the queue is empty no Token can be referenced any more, so all of
  // them can be deallocated in one go.
  if (TokenQueue.empty())
    TokenQueue.resetAlloc();

  return Result;
}
/// Skip a single nb-char starting at \a Position.
///
/// @returns The code unit after the nb-char, or \a Position if there is no
///          nb-char there (including when \a Position is End).
StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
  if (Position == End)
    return Position;

  // Check 7 bit c-printable - b-char.
  const char Unit = *Position;
  const bool IsTab = Unit == 0x09;
  const bool IsPrintableAscii = Unit >= 0x20 && Unit <= 0x7E;
  if (IsTab || IsPrintableAscii)
    return Position + 1;

  // Anything else has to be a multi-byte UTF-8 sequence.
  if (!(uint8_t(Unit) & 0x80))
    return Position;

  // Check for valid UTF-8.
  const UTF8Decoded Decoded = decodeUTF8(Position);
  const uint32_t Scalar = Decoded.first;
  if (Decoded.second == 0 || Scalar == 0xFEFF)
    return Position;

  const bool IsNonBreak = Scalar == 0x85
                       || (Scalar >= 0xA0 && Scalar <= 0xD7FF)
                       || (Scalar >= 0xE000 && Scalar <= 0xFFFD)
                       || (Scalar >= 0x10000 && Scalar <= 0x10FFFF);
  return IsNonBreak ? Position + Decoded.second : Position;
}
/// Skip a single b-break (0xD 0xA, 0xD or 0xA) starting at \a Position.
///
/// @returns The code unit after the b-break, or \a Position if there is no
///          b-break there (including when \a Position is End).
StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
  if (Position == End)
    return Position;

  switch (*Position) {
  case 0x0A:
    return Position + 1;
  case 0x0D: {
    // A line feed directly after the carriage return belongs to the same
    // break.
    const StringRef::iterator Next = Position + 1;
    const bool FollowedByLF = Next != End && *Next == 0x0A;
    return FollowedByLF ? Position + 2 : Next;
  }
  default:
    return Position;
  }
}
/// Skip a single s-space (0x20) starting at \a Position.
///
/// @returns The code unit after the space, or \a Position if there is no
///          space there (including when \a Position is End).
StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
  const bool AtSpace = Position != End && *Position == ' ';
  return AtSpace ? Position + 1 : Position;
}
/// Skip a single s-white (space or tab) starting at \a Position.
///
/// @returns The code unit after the white space character, or \a Position if
///          there is none there (including when \a Position is End).
StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
  const bool AtWhite =
      Position != End && (*Position == ' ' || *Position == '\t');
  return AtWhite ? Position + 1 : Position;
}
/// Skip a single ns-char (an nb-char that is not a space or tab) starting at
/// \a Position.
///
/// @returns The code unit after the ns-char, or \a Position if there is no
///          ns-char there (including when \a Position is End).
StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
  const bool MayBeNonSpace =
      Position != End && *Position != ' ' && *Position != '\t';
  if (MayBeNonSpace)
    return skip_nb_char(Position);
  return Position;
}
/// Apply \a Func repeatedly, starting at \a Position, until it returns its
/// argument unchanged.
///
/// @returns The position at which \a Func stopped making progress.
StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
                                       , StringRef::iterator Position) {
  for (StringRef::iterator Next = (this->*Func)(Position); Next != Position;
       Next = (this->*Func)(Position))
    Position = Next;
  return Position;
}
/// Move Current forward for as long as \a Func keeps accepting, and add the
/// number of code units that were passed over to Column.
void Scanner::advanceWhile(SkipWhileFunc Func) {
  const StringRef::iterator Stop = skip_while(Func, Current);
  const auto Skipped = Stop - Current;
  Column += Skipped;
  Current = Stop;
}
/// Return true if \a C is an ns-hex-digit[36]: a decimal digit or one of the
/// letters A-F in either case.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
/// Return true if \a C is an ns-word-char[38]: a decimal digit, an ASCII
/// letter, or '-'.
static bool is_ns_word_char(const char C) {
  return C == '-'
      || (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'z')
      || (C >= 'A' && C <= 'Z');
}
/// Scan ns-uri-char[39]s starting at Current.
///
/// An ns-uri-char is a '%' followed by two hex digits, a word character, or
/// one of the listed punctuation characters. A percent escape is consumed as
/// a whole, so scanning does not stop in the middle of one. Current and
/// Column are both advanced by the number of code units consumed.
void Scanner::scan_ns_uri_char() {
  while (Current != End) {
    unsigned Width;
    if (   *Current == '%'
        && Current + 2 < End
        && is_ns_hex_digit(*(Current + 1))
        && is_ns_hex_digit(*(Current + 2))) {
      // '%' plus its two hex digits.
      Width = 3;
    } else if (   is_ns_word_char(*Current)
               || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
                    != StringRef::npos) {
      Width = 1;
    } else {
      break;
    }
    Current += Width;
    Column += Width;
  }
}
bool Scanner::consume(uint32_t Expected) {
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
if (Expected >= 0x80) {
setError("Cannot consume non-ascii characters");
return false;
}
if (Current == End)
return false;
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
if (uint8_t(*Current) >= 0x80) {
setError("Cannot consume non-ascii characters");
return false;
}
if (uint8_t(*Current) == Expected) {
++Current;
++Column;
return true;
}
return false;
}
/// Move forward by \p Distance bytes, counting each byte as one column.
///
/// The caller is responsible for staying inside the input; this is only
/// checked by an assertion.
void Scanner::skip(uint32_t Distance) {
  Column += Distance;
  Current += Distance;
  assert(Current <= End && "Skipped past the end");
}
/// Returns true if \p Position holds a space, tab, carriage return or line
/// feed. The end of the input counts as neither blank nor break.
bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
  if (Position == End)
    return false;
  switch (*Position) {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
}
/// Consume one line break at the current position, if there is one.
///
/// On success Current moves past the break, Line is incremented and Column is
/// reset to zero.
///
/// \returns true if a line break was consumed.
bool Scanner::consumeLineBreakIfPresent() {
  const auto AfterBreak = skip_b_break(Current);
  const bool FoundBreak = AfterBreak != Current;
  if (FoundBreak) {
    Current = AfterBreak;
    ++Line;
    Column = 0;
  }
  return FoundBreak;
}
/// Remember \p Tok as a possible simple key.
///
/// Nothing is recorded unless simple keys are currently allowed. The
/// candidate stores the token, the current line, \p AtColumn, \p IsRequired
/// and the current flow level.
void Scanner::saveSimpleKeyCandidate(TokenQueueT::iterator Tok,
                                     unsigned AtColumn,
                                     bool IsRequired) {
  if (!IsSimpleKeyAllowed)
    return;

  SimpleKey Candidate;
  Candidate.Tok = Tok;
  Candidate.Line = Line;
  Candidate.Column = AtColumn;
  Candidate.IsRequired = IsRequired;
  Candidate.FlowLevel = FlowLevel;
  SimpleKeys.push_back(Candidate);
}
/// Drop simple key candidates that can no longer become keys.
///
/// A candidate is stale once the scanner has left its line or has moved more
/// than 1024 columns past it. Dropping a candidate that was marked as
/// required is reported as an error.
void Scanner::removeStaleSimpleKeyCandidates() {
  auto Candidate = SimpleKeys.begin();
  while (Candidate != SimpleKeys.end()) {
    const bool IsStillValid =
        Candidate->Line == Line && Candidate->Column + 1024 >= Column;
    if (IsStillValid) {
      ++Candidate;
      continue;
    }
    if (Candidate->IsRequired)
      setError("Could not find expected : for simple key",
               Candidate->Tok->Range.begin());
    Candidate = SimpleKeys.erase(Candidate);
  }
}
/// Remove the most recent simple key candidate if it was recorded at flow
/// level \p Level. At most one candidate is removed.
void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
  if (SimpleKeys.empty())
    return;
  if (SimpleKeys.back().FlowLevel == Level)
    SimpleKeys.pop_back();
}
bool Scanner::unrollIndent(int ToColumn) {
Token T;
// Indentation is ignored in flow.
if (FlowLevel != 0)
return true;
while (Indent > ToColumn) {
T.Kind = Token::TK_BlockEnd;
T.Range = StringRef(Current, 1);
TokenQueue.push_back(T);
Indent = Indents.pop_back_val();
}
return true;
}
bool Scanner::rollIndent( int ToColumn
, Token::TokenKind Kind
, TokenQueueT::iterator InsertPoint) {
if (FlowLevel)
return true;
if (Indent < ToColumn) {
Indents.push_back(Indent);
Indent = ToColumn;
Token T;
T.Kind = Kind;
T.Range = StringRef(Current, 0);
TokenQueue.insert(InsertPoint, T);
}
return true;
}
void Scanner::skipComment() {
if (*Current != '#')
return;
while (true) {
// This may skip more than one byte, thus Column is only incremented
// for code points.
StringRef::iterator I = skip_nb_char(Current);
if (I == Current)
break;
Current = I;
++Column;
}
}
void Scanner::scanToNextToken() {
while (true) {
while (*Current == ' ' || *Current == '\t') {
skip(1);
}
skipComment();
// Skip EOL.
StringRef::iterator i = skip_b_break(Current);
if (i == Current)
break;
Current = i;
++Line;
Column = 0;
// New lines may start a simple key.
if (!FlowLevel)
IsSimpleKeyAllowed = true;
}
}
bool Scanner::scanStreamStart() {
IsStartOfStream = false;
EncodingInfo EI = getUnicodeEncoding(currentInput());
Token T;
T.Kind = Token::TK_StreamStart;
T.Range = StringRef(Current, EI.second);
TokenQueue.push_back(T);
Current += EI.second;
return true;
}
bool Scanner::scanStreamEnd() {
// Force an ending new line if one isn't present.
if (Column != 0) {
Column = 0;
++Line;
}
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
Token T;
T.Kind = Token::TK_StreamEnd;
T.Range = StringRef(Current, 0);
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanDirective() {
// Reset the indentation level.
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
StringRef::iterator Start = Current;
consume('%');
StringRef::iterator NameStart = Current;
Current = skip_while(&Scanner::skip_ns_char, Current);
StringRef Name(NameStart, Current - NameStart);
Current = skip_while(&Scanner::skip_s_white, Current);
Token T;
if (Name == "YAML") {
Current = skip_while(&Scanner::skip_ns_char, Current);
T.Kind = Token::TK_VersionDirective;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
return true;
} else if(Name == "TAG") {
Current = skip_while(&Scanner::skip_ns_char, Current);
Current = skip_while(&Scanner::skip_s_white, Current);
Current = skip_while(&Scanner::skip_ns_char, Current);
T.Kind = Token::TK_TagDirective;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
return true;
}
return false;
}
bool Scanner::scanDocumentIndicator(bool IsStart) {
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
Token T;
T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
T.Range = StringRef(Current, 3);
skip(3);
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanFlowCollectionStart(bool IsSequence) {
Token T;
T.Kind = IsSequence ? Token::TK_FlowSequenceStart
: Token::TK_FlowMappingStart;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
// [ and { may begin a simple key.
saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
// And may also be followed by a simple key.
IsSimpleKeyAllowed = true;
++FlowLevel;
return true;
}
bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = false;
Token T;
T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
: Token::TK_FlowMappingEnd;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
if (FlowLevel)
--FlowLevel;
return true;
}
bool Scanner::scanFlowEntry() {
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = true;
Token T;
T.Kind = Token::TK_FlowEntry;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanBlockEntry() {
rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = true;
Token T;
T.Kind = Token::TK_BlockEntry;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanKey() {
if (!FlowLevel)
rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = !FlowLevel;
Token T;
T.Kind = Token::TK_Key;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanValue() {
// If the previous token could have been a simple key, insert the key token
// into the token queue.
if (!SimpleKeys.empty()) {
SimpleKey SK = SimpleKeys.pop_back_val();
Token T;
T.Kind = Token::TK_Key;
T.Range = SK.Tok->Range;
TokenQueueT::iterator i, e;
for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
if (i == SK.Tok)
break;
}
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
if (i == e) {
Failed = true;
return false;
}
i = TokenQueue.insert(i, T);
// We may also need to add a Block-Mapping-Start token.
rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
IsSimpleKeyAllowed = false;
} else {
if (!FlowLevel)
rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
IsSimpleKeyAllowed = !FlowLevel;
}
Token T;
T.Kind = Token::TK_Value;
T.Range = StringRef(Current, 1);
skip(1);
TokenQueue.push_back(T);
return true;
}
// Forbidding inlining improves performance by roughly 20%.
// FIXME: Remove once llvm optimizes this to the faster version without hints.
LLVM_ATTRIBUTE_NOINLINE static bool
wasEscaped(StringRef::iterator First, StringRef::iterator Position);

// Returns whether the character at 'Position' is preceded by an odd number of
// consecutive '\' characters, i.e. whether it was escaped.
// 'First' is the position of the first character of the string; the backwards
// search never inspects anything before it.
static bool wasEscaped(StringRef::iterator First,
                       StringRef::iterator Position) {
  assert(Position - 1 >= First);
  // Walk backwards over the run of '\'s that ends right before 'Position'.
  StringRef::iterator Cursor = Position - 1;
  for (; Cursor >= First && *Cursor == '\\'; --Cursor)
    ;
  // 'Cursor' now sits just before that run, so the distance is its length.
  const auto BackslashCount = Position - 1 - Cursor;
  return BackslashCount % 2 == 1;
}
/// Scan a quoted flow scalar whose opening quote is at the current position.
///
/// \param IsDoubleQuoted true for a "..." scalar, false for a '...' scalar.
/// \returns true after queueing a TK_Scalar token whose range includes both
///          quotes; false, with an error set, when the input ends before the
///          closing quote is found.
bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
  StringRef::iterator Start = Current;
  unsigned ColStart = Column;
  if (IsDoubleQuoted) {
    // NOTE(review): this branch advances Current without updating Column or
    // Line; only the final skip(1) below touches Column.
    do {
      ++Current;
      while (Current != End && *Current != '"')
        ++Current;
      // Repeat until the previous character was not a '\' or was an escaped
      // backslash.
    } while (   Current != End
             && *(Current - 1) == '\\'
             && wasEscaped(Start + 1, Current));
  } else {
    skip(1);
    // Running into End leaves the loop so that the missing closing quote is
    // reported below; End itself is never dereferenced.
    while (Current != End) {
      // Skip a ' followed by another '.
      if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
        skip(2);
        continue;
      }
      if (*Current == '\'')
        break;
      StringRef::iterator i = skip_nb_char(Current);
      if (i != Current) {
        // Always consume the character, even when it is the last one of the
        // input. Otherwise the final character of an unterminated scalar
        // would be mistaken for the closing quote.
        Current = i;
        ++Column;
        continue;
      }
      i = skip_b_break(Current);
      if (i == Current)
        break;
      Current = i;
      Column = 0;
      ++Line;
    }
  }
  if (Current == End) {
    setError("Expected quote at end of scalar", Current);
    return false;
  }
  skip(1); // Skip ending quote.
  Token T;
  T.Kind = Token::TK_Scalar;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);
  // Quoted scalars can be simple keys.
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
  IsSimpleKeyAllowed = false;
  return true;
}
/// Scan a plain (unquoted) scalar starting at the current position.
///
/// On success a TK_Scalar token covering [Start, Current) is queued and
/// recorded as a simple key candidate.
///
/// \returns true on success; false, with an error set, for an unexpected ':'
///          in a flow context, a tab inside indentation, or an empty scalar.
bool Scanner::scanPlainScalar() {
StringRef::iterator Start = Current;
unsigned ColStart = Column;
unsigned LeadingBlanks = 0;
assert(Indent >= -1 && "Indent must be >= -1 !");
// In block context, a line at a column below this value ends the scalar (see
// the check after the blank-eating loop).
unsigned indent = static_cast<unsigned>(Indent + 1);
while (true) {
// A '#' at the start of a segment begins a comment and ends the scalar.
if (*Current == '#')
break;
// Consume one run of non-blank characters.
while (!isBlankOrBreak(Current)) {
// Inside a flow collection, ':' must be followed by a blank, a break or ','.
if ( FlowLevel && *Current == ':'
&& !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
setError("Found unexpected ':' while scanning a plain scalar", Current);
return false;
}
// Check for the end of the plain scalar.
if ( (*Current == ':' && isBlankOrBreak(Current + 1))
|| ( FlowLevel
&& (StringRef(Current, 1).find_first_of(",:?[]{}")
!= StringRef::npos)))
break;
StringRef::iterator i = skip_nb_char(Current);
if (i == Current)
break;
Current = i;
++Column;
}
// Are we at the end?
if (!isBlankOrBreak(Current))
break;
// Eat blanks.
// Tmp runs ahead over blanks and line breaks; Current is only moved to Tmp
// below if the scalar continues. Column and Line are updated while looking
// ahead, whether or not Current ends up following.
StringRef::iterator Tmp = Current;
while (isBlankOrBreak(Tmp)) {
StringRef::iterator i = skip_s_white(Tmp);
if (i != Tmp) {
// Once a line break has been seen, a tab before the required indentation
// is an error.
if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
setError("Found invalid tab character in indentation", Tmp);
return false;
}
Tmp = i;
++Column;
} else {
i = skip_b_break(Tmp);
if (!LeadingBlanks)
LeadingBlanks = 1;
Tmp = i;
Column = 0;
++Line;
}
}
// In block context the scalar ends when the next line is not indented enough.
if (!FlowLevel && Column < indent)
break;
Current = Tmp;
}
if (Start == Current) {
setError("Got empty plain scalar", Start);
return false;
}
Token T;
T.Kind = Token::TK_Scalar;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
// Plain scalars can be simple keys.
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
IsSimpleKeyAllowed = false;
return true;
}
bool Scanner::scanAliasOrAnchor(bool IsAlias) {
StringRef::iterator Start = Current;
unsigned ColStart = Column;
skip(1);
while(true) {
if ( *Current == '[' || *Current == ']'
|| *Current == '{' || *Current == '}'
|| *Current == ','
|| *Current == ':')
break;
StringRef::iterator i = skip_ns_char(Current);
if (i == Current)
break;
Current = i;
++Column;
}
if (Start == Current) {
setError("Got empty alias or anchor", Start);
return false;
}
Token T;
T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
// Alias and anchors can be simple keys.
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
IsSimpleKeyAllowed = false;
return true;
}
/// Consume an optional block chomping indicator.
///
/// \returns '+' or '-' if one was consumed, otherwise ' ' with the position
///          left unchanged.
char Scanner::scanBlockChompingIndicator() {
  if (Current == End)
    return ' ';
  const char Candidate = *Current;
  if (Candidate != '+' && Candidate != '-')
    return ' ';
  skip(1);
  return Candidate;
}
/// Compute how many trailing line breaks remain after chomping.
///
/// \param ChompingIndicator '-' strips all trailing line breaks, '+' keeps
///        all \p LineBreaks of them; any other value clips to a single line
///        break, or to none when \p Str is empty.
/// \returns the number of trailing line breaks to emit.
static unsigned getChompedLineBreaks(char ChompingIndicator,
                                     unsigned LineBreaks, StringRef Str) {
  switch (ChompingIndicator) {
  case '-': // Strip all line breaks.
    return 0;
  case '+': // Keep all line breaks.
    return LineBreaks;
  default: // Clip trailing lines.
    return Str.empty() ? 0 : 1;
  }
}
/// Consume an optional block indentation indicator, a single digit '1'-'9'.
///
/// \returns the digit's value if one was consumed, otherwise 0 with the
///          position left unchanged.
unsigned Scanner::scanBlockIndentationIndicator() {
  const bool HasDigit = Current != End && *Current >= '1' && *Current <= '9';
  if (!HasDigit)
    return 0;
  const unsigned Digit = unsigned(*Current - '0');
  skip(1);
  return Digit;
}
bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
unsigned &IndentIndicator, bool &IsDone) {
auto Start = Current;
ChompingIndicator = scanBlockChompingIndicator();
IndentIndicator = scanBlockIndentationIndicator();
// Check for the chomping indicator once again.
if (ChompingIndicator == ' ')
ChompingIndicator = scanBlockChompingIndicator();
Current = skip_while(&Scanner::skip_s_white, Current);
skipComment();
if (Current == End) { // EOF, we have an empty scalar.
Token T;
T.Kind = Token::TK_BlockScalar;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
IsDone = true;
return true;
}
if (!consumeLineBreakIfPresent()) {
setError("Expected a line break after block scalar header", Current);
return false;
}
return true;
}
/// Determine a block scalar's indentation from its first non-empty line.
///
/// Leading lines that contain only spaces are skipped; each consumed line
/// break increments \p LineBreaks.
///
/// \param [out] BlockIndent set to the column of the first non-empty line.
/// \param BlockExitIndent a non-empty line at or below this column ends the
///        block scalar.
/// \param [in,out] LineBreaks number of line breaks consumed so far.
/// \param [out] IsDone set to true when the block scalar ends before any
///        content line is found.
/// \returns false, with an error set, if a leading all-space line is longer
///          than the discovered indentation.
bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
unsigned BlockExitIndent,
unsigned &LineBreaks, bool &IsDone) {
unsigned MaxAllSpaceLineCharacters = 0;
// Only read once MaxAllSpaceLineCharacters is non-zero, i.e. after the
// assignment further down has run at least once.
StringRef::iterator LongestAllSpaceLine;
while (true) {
advanceWhile(&Scanner::skip_s_space);
if (skip_nb_char(Current) != Current) {
// This line isn't empty, so try and find the indentation.
if (Column <= BlockExitIndent) { // End of the block literal.
IsDone = true;
return true;
}
// We found the block's indentation.
BlockIndent = Column;
if (MaxAllSpaceLineCharacters > BlockIndent) {
setError(
"Leading all-spaces line must be smaller than the block indent",
LongestAllSpaceLine);
return false;
}
return true;
}
if (skip_b_break(Current) != Current &&
Column > MaxAllSpaceLineCharacters) {
// Record the longest all-space line in case it's longer than the
// discovered block indent.
MaxAllSpaceLineCharacters = Column;
LongestAllSpaceLine = Current;
}
// Check for EOF.
if (Current == End) {
IsDone = true;
return true;
}
// Not at EOF, yet neither a skippable character nor a line break follows.
if (!consumeLineBreakIfPresent()) {
IsDone = true;
return true;
}
++LineBreaks;
}
// Not reached: every path out of the loop above returns.
return true;
}
/// Skip the indentation of one block scalar line and classify the line.
///
/// \param BlockIndent the block scalar's indentation column.
/// \param BlockExitIndent a text line at or below this column ends the block
///        scalar.
/// \param [out] IsDone set to true when the line ends the block scalar,
///        either because it is indented too little or because it is a less
///        indented trailing comment.
/// \returns false, with an error set, for a text line that is less indented
///          than the block scalar without ending it.
bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
                                    unsigned BlockExitIndent, bool &IsDone) {
  // Skip the indentation.
  for (; Column < BlockIndent; ++Column) {
    const auto Next = skip_s_space(Current);
    if (Next == Current)
      break;
    Current = Next;
  }

  // Nothing skippable follows, so the line holds no text: nothing to classify.
  if (skip_nb_char(Current) == Current)
    return true;

  const bool LeftBlock = Column <= BlockExitIndent; // End of the block literal.
  const bool IsTrailingComment =
      Column < BlockIndent && Current != End && *Current == '#';
  if (LeftBlock || IsTrailingComment) {
    IsDone = true;
    return true;
  }

  if (Column < BlockIndent) {
    setError("A text line is less indented than the block scalar", Current);
    return false;
  }
  return true; // A normal text line.
}
/// Scan a block scalar introduced by '|' or '>' at the current position.
///
/// The text lines are joined with '\n' into the token's Value, and trailing
/// line breaks are adjusted according to the chomping indicator.
///
/// \param IsLiteral not read by this function; both scalar styles are scanned
///        the same way.
/// \returns true after queueing a TK_BlockScalar token; false if the header
///          or a line's indentation is malformed (an error has been set).
bool Scanner::scanBlockScalar(bool IsLiteral) {
// Eat '|' or '>'
assert(*Current == '|' || *Current == '>');
skip(1);
char ChompingIndicator;
// Written by scanBlockScalarHeader(); 0 means no explicit indentation
// indicator was given.
unsigned BlockIndent;
bool IsDone = false;
if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
return false;
// The input ended right after the header; the header scan already queued the
// token.
if (IsDone)
return true;
auto Start = Current;
unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
unsigned LineBreaks = 0;
// Without an explicit indicator, detect the indentation from the content.
if (BlockIndent == 0) {
if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
IsDone))
return false;
}
// Scan the block's scalars body.
SmallString<256> Str;
while (!IsDone) {
if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
return false;
if (IsDone)
break;
// Parse the current line.
auto LineStart = Current;
advanceWhile(&Scanner::skip_nb_char);
if (LineStart != Current) {
// Emit the line breaks that were held back, then the line's text.
Str.append(LineBreaks, '\n');
Str.append(StringRef(LineStart, Current - LineStart));
LineBreaks = 0;
}
// Check for EOF.
if (Current == End)
break;
if (!consumeLineBreakIfPresent())
break;
++LineBreaks;
}
if (Current == End && !LineBreaks)
// Ensure that there is at least one line break before the end of file.
LineBreaks = 1;
Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
// New lines may start a simple key.
if (!FlowLevel)
IsSimpleKeyAllowed = true;
Token T;
T.Kind = Token::TK_BlockScalar;
T.Range = StringRef(Start, Current - Start);
T.Value = Str.str().str();
TokenQueue.push_back(T);
return true;
}
bool Scanner::scanTag() {
StringRef::iterator Start = Current;
unsigned ColStart = Column;
skip(1); // Eat !.
if (Current == End || isBlankOrBreak(Current)); // An empty tag.
else if (*Current == '<') {
skip(1);
scan_ns_uri_char();
if (!consume('>'))
return false;
} else {
// FIXME: Actually parse the c-ns-shorthand-tag rule.
Current = skip_while(&Scanner::skip_ns_char, Current);
}
Token T;
T.Kind = Token::TK_Tag;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
// Tags can be simple keys.
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
IsSimpleKeyAllowed = false;
return true;
}
/// Scan the next token(s) into the token queue.
///
/// After skipping to the next token, the character at Current selects which
/// scan function runs. The order of the checks matters: earlier checks take
/// precedence over later ones.
///
/// \returns the result of the selected scan function, or false after setting
///          an error when no token can start with the current character.
bool Scanner::fetchMoreTokens() {
if (IsStartOfStream)
return scanStreamStart();
scanToNextToken();
if (Current == End)
return scanStreamEnd();
removeStaleSimpleKeyCandidates();
// Close any blocks that are indented deeper than the current column.
unrollIndent(Column);
// Directives and document indicators are only recognized at column 0.
if (Column == 0 && *Current == '%')
return scanDirective();
// NOTE(review): `Current + 4 <= End` requires a character after the three
// indicator characters, so the `Current + 3 == End` alternative can never be
// true here or in the "..." check below. Confirm whether an indicator at the
// very end of the input is meant to be recognized.
if (Column == 0 && Current + 4 <= End
&& *Current == '-'
&& *(Current + 1) == '-'
&& *(Current + 2) == '-'
&& (Current + 3 == End || isBlankOrBreak(Current + 3)))
return scanDocumentIndicator(true);
if (Column == 0 && Current + 4 <= End
&& *Current == '.'
&& *(Current + 1) == '.'
&& *(Current + 2) == '.'
&& (Current + 3 == End || isBlankOrBreak(Current + 3)))
return scanDocumentIndicator(false);
// Flow collection indicators.
if (*Current == '[')
return scanFlowCollectionStart(true);
if (*Current == '{')
return scanFlowCollectionStart(false);
if (*Current == ']')
return scanFlowCollectionEnd(true);
if (*Current == '}')
return scanFlowCollectionEnd(false);
if (*Current == ',')
return scanFlowEntry();
// Block entry, explicit key and value indicators. Outside flow collections
// '?' and ':' only count when followed by a blank or a line break.
if (*Current == '-' && isBlankOrBreak(Current + 1))
return scanBlockEntry();
if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
return scanKey();
if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
return scanValue();
// Node properties.
if (*Current == '*')
return scanAliasOrAnchor(true);
if (*Current == '&')
return scanAliasOrAnchor(false);
if (*Current == '!')
return scanTag();
// Block scalars are only recognized outside flow collections.
if (*Current == '|' && !FlowLevel)
return scanBlockScalar(true);
if (*Current == '>' && !FlowLevel)
return scanBlockScalar(false);
// Quoted scalars.
if (*Current == '\'')
return scanFlowScalar(false);
if (*Current == '"')
return scanFlowScalar(true);
// Get a plain scalar.
// A plain scalar starts with any character that is neither a blank, a break
// nor one of the listed indicators; the remaining alternatives admit a few
// indicator characters depending on what follows them.
StringRef FirstChar(Current, 1);
if (!(isBlankOrBreak(Current)
|| FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
|| (*Current == '-' && !isBlankOrBreak(Current + 1))
|| (!FlowLevel && (*Current == '?' || *Current == ':')
&& isBlankOrBreak(Current + 1))
|| (!FlowLevel && *Current == ':'
&& Current + 2 < End
&& *(Current + 1) == ':'
&& !isBlankOrBreak(Current + 2)))
return scanPlainScalar();
setError("Unrecognized character while tokenizing.");
return false;
}
/// Create a stream over the text \p Input. All arguments are passed on
/// unchanged to the Scanner this stream creates; no document is current yet.
Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
std::error_code *EC)
: scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {}
/// Create a stream over the buffer \p InputBuffer. All arguments are passed
/// on unchanged to the Scanner this stream creates; no document is current
/// yet.
Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
std::error_code *EC)
: scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
// Defaulted here, in the source file, rather than in the class definition.
Stream::~Stream() = default;
/// Returns whether the underlying scanner has failed.
bool Stream::failed() { return scanner->failed(); }
void Stream::printError(Node *N, const Twine &Msg) {
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
SMRange Range = N ? N->getSourceRange() : SMRange();
scanner->printError( Range.Start
, SourceMgr::DK_Error
, Msg
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
, Range);
}
/// Start iterating over the documents of this stream.
///
/// A stream can only be iterated once: calling this again after a document
/// has been created is a fatal error.
///
/// \returns an iterator referring to the newly created first document.
document_iterator Stream::begin() {
if (CurrentDoc)
report_fatal_error("Can only iterate over the stream once");
// Skip Stream-Start.
scanner->getNext();
CurrentDoc.reset(new Document(*this));
return document_iterator(CurrentDoc);
}
/// Returns the end iterator, a default-constructed document_iterator.
document_iterator Stream::end() {
return document_iterator();
}
void Stream::skip() {
for (document_iterator i = begin(), e = end(); i != e; ++i)
i->skip();
}
/// Construct a node of kind \p Type that belongs to document \p D, with
/// anchor \p A and tag \p T.
Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
StringRef T)
: Doc(D), TypeID(Type), Anchor(A), Tag(T) {
// The source range starts out empty, located at the beginning of the next
// token.
SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
SourceRange = SMRange(Start, Start);
}
std::string Node::getVerbatimTag() const {
StringRef Raw = getRawTag();
if (!Raw.empty() && Raw != "!") {
std::string Ret;
if (Raw.find_last_of('!') == 0) {
Ret = std::string(Doc->getTagMap().find("!")->second);
Ret += Raw.substr(1);
return Ret;
} else if (Raw.startswith("!!")) {
Ret = std::string(Doc->getTagMap().find("!!")->second);
Ret += Raw.substr(2);
return Ret;
} else {
StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
std::map<StringRef, StringRef>::const_iterator It =
Doc->getTagMap().find(TagHandle);
if (It != Doc->getTagMap().end())
Ret = std::string(It->second);
else {
Token T;
T.Kind = Token::TK_Tag;
T.Range = TagHandle;
setError(Twine("Unknown tag handle ") + TagHandle, T);
}
Ret += Raw.substr(Raw.find_last_of('!') + 1);
return Ret;
}
}
switch (getType()) {
case NK_Null:
return "tag:yaml.org,2002:null";
case NK_Scalar:
case NK_BlockScalar:
// TODO: Tag resolution.
return "tag:yaml.org,2002:str";
case NK_Mapping:
return "tag:yaml.org,2002:map";
case NK_Sequence:
return "tag:yaml.org,2002:seq";
}
return "";
}
/// Forward to the owning document's peekNext().
Token &Node::peekNext() {
  Document &Owner = *Doc;
  return Owner.peekNext();
}
/// Forward to the owning document's getNext().
Token Node::getNext() {
  Document &Owner = *Doc;
  return Owner.getNext();
}
/// Forward to the owning document's parseBlockNode().
Node *Node::parseBlockNode() {
  Document &Owner = *Doc;
  return Owner.parseBlockNode();
}
/// Return the owning document's NodeAllocator.
BumpPtrAllocator &Node::getAllocator() {
  Document &Owner = *Doc;
  return Owner.NodeAllocator;
}
void Node::setError(const Twine &Msg, Token &Tok) const {
Doc->setError(Msg, Tok);
}
bool Node::failed() const {
return Doc->failed();
}
/// Return the value of this scalar with its quoting resolved.
///
/// \param Storage scratch buffer, used only when the text has to be rewritten
///        (escapes or line breaks in a double-quoted scalar, doubled quotes in
///        a single-quoted one). The result then points into it.
/// \returns the scalar contents. Plain scalars are returned with trailing
///          spaces trimmed.
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  // TODO: Handle newlines properly. We need to remove leading whitespace.
  const char Lead = Value[0];

  if (Lead == '"') { // Double quoted.
    // Pull off the leading and trailing "s.
    StringRef Body = Value.substr(1, Value.size() - 2);
    // Only values containing a backslash or a line break need rewriting.
    const StringRef::size_type Special = Body.find_first_of("\\\r\n");
    if (Special == StringRef::npos)
      return Body;
    return unescapeDoubleQuoted(Body, Special, Storage);
  }

  if (Lead == '\'') { // Single quoted.
    // Pull off the leading and trailing 's.
    StringRef Body = Value.substr(1, Value.size() - 2);
    StringRef::size_type Quote = Body.find('\'');
    if (Quote == StringRef::npos)
      return Body;

    // We're going to need Storage.
    Storage.clear();
    Storage.reserve(Body.size());
    while (Quote != StringRef::npos) {
      // Copy the text before the quote, emit a single quote, and step over
      // two characters (the quote and the character after it).
      Storage.append(Body.begin(), Body.begin() + Quote);
      Storage.push_back('\'');
      Body = Body.substr(Quote + 2);
      Quote = Body.find('\'');
    }
    Storage.append(Body.begin(), Body.end());
    return StringRef(Storage.begin(), Storage.size());
  }

  // Plain or block.
  return Value.rtrim(' ');
}
StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
, StringRef::size_type i
, SmallVectorImpl<char> &Storage)
const {
// Use Storage to build proper value.
Storage.clear();
Storage.reserve(UnquotedValue.size());
for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
// Insert all previous chars into Storage.
StringRef Valid(UnquotedValue.begin(), i);
Storage.insert(Storage.end(), Valid.begin(), Valid.end());
// Chop off inserted chars.
UnquotedValue = UnquotedValue.substr(i);
assert(!UnquotedValue.empty() && "Can't be empty!");
// Parse escape or line break.
switch (UnquotedValue[0]) {
case '\r':
case '\n':
Storage.push_back('\n');
if ( UnquotedValue.size() > 1
&& (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
UnquotedValue = UnquotedValue.substr(1);
UnquotedValue = UnquotedValue.substr(1);
break;
default:
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
if (UnquotedValue.size() == 1) {
Token T;
T.Range = StringRef(UnquotedValue.begin(), 1);
setError("Unrecognized escape code", T);
return "";
}
UnquotedValue = UnquotedValue.substr(1);
switch (UnquotedValue[0]) {
default: {
Token T;
T.Range = StringRef(UnquotedValue.begin(), 1);
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
setError("Unrecognized escape code", T);
return "";
}
case '\r':
case '\n':
// Remove the new line.
if ( UnquotedValue.size() > 1
&& (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
UnquotedValue = UnquotedValue.substr(1);
// If this was just a single byte newline, it will get skipped
// below.
break;
case '0':
Storage.push_back(0x00);
break;
case 'a':
Storage.push_back(0x07);
break;
case 'b':
Storage.push_back(0x08);
break;
case 't':
case 0x09:
Storage.push_back(0x09);
break;
case 'n':
Storage.push_back(0x0A);
break;
case 'v':
Storage.push_back(0x0B);
break;
case 'f':
Storage.push_back(0x0C);
break;
case 'r':
Storage.push_back(0x0D);
break;
case 'e':
Storage.push_back(0x1B);
break;
case ' ':
Storage.push_back(0x20);
break;
case '"':
Storage.push_back(0x22);
break;
case '/':
Storage.push_back(0x2F);
break;
case '\\':
Storage.push_back(0x5C);
break;
case 'N':
encodeUTF8(0x85, Storage);
break;
case '_':
encodeUTF8(0xA0, Storage);
break;
case 'L':
encodeUTF8(0x2028, Storage);
break;
case 'P':
encodeUTF8(0x2029, Storage);
break;
case 'x': {
if (UnquotedValue.size() < 3)
// TODO: Report error.
break;
unsigned int UnicodeScalarValue;
if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
// TODO: Report error.
UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(2);
break;
}
case 'u': {
if (UnquotedValue.size() < 5)
// TODO: Report error.
break;
unsigned int UnicodeScalarValue;
if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
// TODO: Report error.
UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(4);
break;
}
case 'U': {
if (UnquotedValue.size() < 9)
// TODO: Report error.
break;
unsigned int UnicodeScalarValue;
if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
// TODO: Report error.
UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(8);
break;
}
}
UnquotedValue = UnquotedValue.substr(1);
}
}
Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
return StringRef(Storage.begin(), Storage.size());
}
/// Return the key of this entry, parsing it on first use and caching it in
/// Key.
///
/// A NullNode is produced when no key is present, both before and after an
/// optional TK_Key token.
Node *KeyValueNode::getKey() {
  if (Key)
    return Key;

  // Handle implicit null keys.
  switch (peekNext().Kind) {
  case Token::TK_BlockEnd:
  case Token::TK_Value:
  case Token::TK_Error:
    return Key = new (getAllocator()) NullNode(Doc);
  case Token::TK_Key:
    getNext(); // skip TK_Key.
    break;
  default:
    break;
  }

  // Handle explicit null keys.
  const auto AfterKeyToken = peekNext().Kind;
  const bool IsExplicitNull = AfterKeyToken == Token::TK_BlockEnd ||
                              AfterKeyToken == Token::TK_Value;
  if (IsExplicitNull)
    return Key = new (getAllocator()) NullNode(Doc);

  // We've got a normal key.
  return Key = parseBlockNode();
}
Node *KeyValueNode::getValue() {
if (Value)
return Value;
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
if (Node* Key = getKey())
Key->skip();
else {
setError("Null key in Key Value.", peekNext());
return Value = new (getAllocator()) NullNode(Doc);
}
if (failed())
return Value = new (getAllocator()) NullNode(Doc);
// Handle implicit null values.
{
Token &t = peekNext();
if ( t.Kind == Token::TK_BlockEnd
|| t.Kind == Token::TK_FlowMappingEnd
|| t.Kind == Token::TK_Key
|| t.Kind == Token::TK_FlowEntry
|| t.Kind == Token::TK_Error) {
return Value = new (getAllocator()) NullNode(Doc);
}
if (t.Kind != Token::TK_Value) {
setError("Unexpected token in Key Value.", t);
return Value = new (getAllocator()) NullNode(Doc);
}
getNext(); // skip TK_Value.
}
// Handle explicit null values.
Token &t = peekNext();
if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
return Value = new (getAllocator()) NullNode(Doc);
}
// We got a normal value.
return Value = parseBlockNode();
}
void MappingNode::increment() {
if (failed()) {
IsAtEnd = true;
CurrentEntry = nullptr;
return;
}
if (CurrentEntry) {
CurrentEntry->skip();
if (Type == MT_Inline) {
IsAtEnd = true;
CurrentEntry = nullptr;
return;
}
}
Token T = peekNext();
if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
// KeyValueNode eats the TK_Key. That way it can detect null keys.
CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
} else if (Type == MT_Block) {
switch (T.Kind) {
case Token::TK_BlockEnd:
getNext();
IsAtEnd = true;
CurrentEntry = nullptr;
break;
default:
setError("Unexpected token. Expected Key or Block End", T);
LLVM_FALLTHROUGH;
case Token::TK_Error:
IsAtEnd = true;
CurrentEntry = nullptr;
}
} else {
switch (T.Kind) {
case Token::TK_FlowEntry:
// Eat the flow entry and recurse.
getNext();
return increment();
case Token::TK_FlowMappingEnd:
getNext();
LLVM_FALLTHROUGH;
case Token::TK_Error:
// Set this to end iterator.
IsAtEnd = true;
CurrentEntry = nullptr;
break;
default:
setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
"Mapping End."
, T);
IsAtEnd = true;
CurrentEntry = nullptr;
}
}
}
void SequenceNode::increment() {
if (failed()) {
IsAtEnd = true;
CurrentEntry = nullptr;
return;
}
if (CurrentEntry)
CurrentEntry->skip();
Token T = peekNext();
if (SeqType == ST_Block) {
switch (T.Kind) {
case Token::TK_BlockEntry:
getNext();
CurrentEntry = parseBlockNode();
if (!CurrentEntry) { // An error occurred.
IsAtEnd = true;
CurrentEntry = nullptr;
}
break;
case Token::TK_BlockEnd:
getNext();
IsAtEnd = true;
CurrentEntry = nullptr;
break;
default:
setError( "Unexpected token. Expected Block Entry or Block End."
, T);
LLVM_FALLTHROUGH;
case Token::TK_Error:
IsAtEnd = true;
CurrentEntry = nullptr;
}
} else if (SeqType == ST_Indentless) {
switch (T.Kind) {
case Token::TK_BlockEntry:
getNext();
CurrentEntry = parseBlockNode();
if (!CurrentEntry) { // An error occurred.
IsAtEnd = true;
CurrentEntry = nullptr;
}
break;
default:
case Token::TK_Error:
IsAtEnd = true;
CurrentEntry = nullptr;
}
} else if (SeqType == ST_Flow) {
switch (T.Kind) {
case Token::TK_FlowEntry:
// Eat the flow entry and recurse.
getNext();
WasPreviousTokenFlowEntry = true;
return increment();
case Token::TK_FlowSequenceEnd:
getNext();
LLVM_FALLTHROUGH;
case Token::TK_Error:
// Set this to end iterator.
IsAtEnd = true;
CurrentEntry = nullptr;
break;
case Token::TK_StreamEnd:
case Token::TK_DocumentEnd:
case Token::TK_DocumentStart:
setError("Could not find closing ]!", T);
// Set this to end iterator.
IsAtEnd = true;
CurrentEntry = nullptr;
break;
default:
if (!WasPreviousTokenFlowEntry) {
setError("Expected , between entries!", T);
IsAtEnd = true;
CurrentEntry = nullptr;
break;
}
// Otherwise it must be a flow entry.
CurrentEntry = parseBlockNode();
if (!CurrentEntry) {
IsAtEnd = true;
}
WasPreviousTokenFlowEntry = false;
break;
}
}
}
/// Construct a document that reads from stream \p S.
///
/// Installs the two default tag handles and parses any leading directives. If
/// directives were found, a document-start token must follow them. A
/// document-start token that is next afterwards is consumed.
Document::Document(Stream &S) : stream(S), Root(nullptr) {
  // Tag maps starts with two default mappings.
  TagMap["!"] = "!";
  TagMap["!!"] = "tag:yaml.org,2002:";

  const bool SawDirectives = parseDirectives();
  if (SawDirectives)
    expectToken(Token::TK_DocumentStart);

  if (peekNext().Kind == Token::TK_DocumentStart)
    getNext();
}
bool Document::skip() {
if (stream.scanner->failed())
return false;
if (!Root && !getRoot())
return false;
Root->skip();
Token &T = peekNext();
if (T.Kind == Token::TK_StreamEnd)
return false;
if (T.Kind == Token::TK_DocumentEnd) {
getNext();
return skip();
}
return true;
}
/// Forward to the stream scanner's peekNext().
Token &Document::peekNext() {
  auto &Scan = *stream.scanner;
  return Scan.peekNext();
}
/// Forward to the stream scanner's getNext().
Token Document::getNext() {
  auto &Scan = *stream.scanner;
  return Scan.getNext();
}
/// Report \p Message on the stream's scanner, positioned at the beginning of
/// \p Location's range.
void Document::setError(const Twine &Message, Token &Location) const {
  const char *Where = Location.Range.begin();
  stream.scanner->setError(Message, Where);
}
/// Return the stream scanner's failed() state.
bool Document::failed() const {
  const bool ScannerFailed = stream.scanner->failed();
  return ScannerFailed;
}
Node *Document::parseBlockNode() {
Token T = peekNext();
// Handle properties.
Token AnchorInfo;
Token TagInfo;
parse_property:
switch (T.Kind) {
case Token::TK_Alias:
getNext();
return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
case Token::TK_Anchor:
if (AnchorInfo.Kind == Token::TK_Anchor) {
setError("Already encountered an anchor for this node!", T);
return nullptr;
}
AnchorInfo = getNext(); // Consume TK_Anchor.
T = peekNext();
goto parse_property;
case Token::TK_Tag:
if (TagInfo.Kind == Token::TK_Tag) {
setError("Already encountered a tag for this node!", T);
return nullptr;
}
TagInfo = getNext(); // Consume TK_Tag.
T = peekNext();
goto parse_property;
default:
break;
}
switch (T.Kind) {
case Token::TK_BlockEntry:
// We got an unindented BlockEntry sequence. This is not terminated with
// a BlockEnd.
// Don't eat the TK_BlockEntry, SequenceNode needs it.
return new (NodeAllocator) SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, SequenceNode::ST_Indentless);
case Token::TK_BlockSequenceStart:
getNext();
return new (NodeAllocator)
SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, SequenceNode::ST_Block);
case Token::TK_BlockMappingStart:
getNext();
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, MappingNode::MT_Block);
case Token::TK_FlowSequenceStart:
getNext();
return new (NodeAllocator)
SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, SequenceNode::ST_Flow);
case Token::TK_FlowMappingStart:
getNext();
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, MappingNode::MT_Flow);
case Token::TK_Scalar:
getNext();
return new (NodeAllocator)
ScalarNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, T.Range);
case Token::TK_BlockScalar: {
getNext();
StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
return new (NodeAllocator)
BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
TagInfo.Range, StrCopy, T.Range);
}
case Token::TK_Key:
// Don't eat the TK_Key, KeyValueNode expects it.
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
, TagInfo.Range
, MappingNode::MT_Inline);
case Token::TK_DocumentStart:
case Token::TK_DocumentEnd:
case Token::TK_StreamEnd:
default:
// TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
// !!null null.
return new (NodeAllocator) NullNode(stream.CurrentDoc);
YAML parser robustness improvements Summary: This patch fixes a number of bugs found in the YAML parser through fuzzing. In general, this makes the parser more robust against malformed inputs. The fixes are mostly improved null checking and returning errors in more cases. In some cases, asserts were changed to regular errors, this provides the same robustness but also protects release builds from the triggering conditions. This also improves the fuzzability of the YAML parser since asserts can act as a roadblock to further fuzzing once they're hit. Each fix has a corresponding test case: - TestAnchorMapError - Added proper null pointer handling in `Stream::printError` if N is null and `KeyValueNode::getValue` if getKey returns null, `Input::createHNodes` `dyn_casts` changed to `dyn_cast_or_null` so the null pointer checks are actually able to fail - TestFlowSequenceTokenErrors - Added case in `Document::parseBlockNode` for FlowMappingEnd, FlowSequenceEnd, or FlowEntry tokens outside of mappings or sequences - TestDirectiveMappingNoValue - Changed assert to regular error return in `Scanner::scanValue` - TestUnescapeInfiniteLoop - Fixed infinite loop in `ScalarNode::unescapeDoubleQuoted` by returning an error for unrecognized escape codes - TestScannerUnexpectedCharacter - Changed asserts to regular error returns in `Scanner::consume` - TestUnknownDirective - For both of the inputs the stream doesn't fail and correctly returns TK_Error, but there is no valid root node for the document. There's no reasonable way to make the scanner fail for unknown directives without breaking the YAML spec (see spec-07-01.test). I think the assert is unnecessary given that an error is still generated for this case. The `SimpleKeys.clear()` line fixes a bug found by AddressSanitizer triggered by multiple test cases - when TokenQueue is cleared SimpleKeys is still holding dangling pointers into it, so SimpleKeys should be cleared as well. Patch by Thomas Finch! 
Reviewers: chandlerc, Bigcheese, hintonda Reviewed By: Bigcheese, hintonda Subscribers: hintonda, kristina, beanz, dexonsmith, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61608
2019-11-06 13:51:04 +08:00
case Token::TK_FlowMappingEnd:
case Token::TK_FlowSequenceEnd:
case Token::TK_FlowEntry: {
if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root)))
return new (NodeAllocator) NullNode(stream.CurrentDoc);
setError("Unexpected token", T);
return nullptr;
}
case Token::TK_Error:
return nullptr;
}
llvm_unreachable("Control flow shouldn't reach here.");
return nullptr;
}
bool Document::parseDirectives() {
bool isDirective = false;
while (true) {
Token T = peekNext();
if (T.Kind == Token::TK_TagDirective) {
parseTAGDirective();
isDirective = true;
} else if (T.Kind == Token::TK_VersionDirective) {
parseYAMLDirective();
isDirective = true;
} else
break;
}
return isDirective;
}
/// Consume the version directive token; its contents are not inspected.
void Document::parseYAMLDirective() {
  // Eat %YAML <version>
  (void)getNext();
}
void Document::parseTAGDirective() {
Token Tag = getNext(); // %TAG <handle> <prefix>
StringRef T = Tag.Range;
// Strip %TAG
T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
std::size_t HandleEnd = T.find_first_of(" \t");
StringRef TagHandle = T.substr(0, HandleEnd);
StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
TagMap[TagHandle] = TagPrefix;
}
bool Document::expectToken(int TK) {
Token T = getNext();
if (T.Kind != TK) {
setError("Unexpected token", T);
return false;
}
return true;
}