[llvm-rc] Have the tokenizer discard single & block comments.

This allows rc files to have comments.  Eventually we should
just use clang's C preprocessor, but that's a bit more
effort for minimal gain, and this is straightforward.

Differential Revision: https://reviews.llvm.org/D38651

llvm-svn: 315207
This commit is contained in:
Zachary Turner 2017-10-09 15:46:13 +00:00
parent 2a61a821a0
commit bd3a9dbabb
4 changed files with 72 additions and 0 deletions

View File

@ -3,6 +3,14 @@ He11o LLVM
"RC string test.",L"Another RC string test.'&{",42,100
Block Comment Ident /*block /* // comment */ ifier
Line Comment // Identifier /*
/* Multi line
block
comment */
Multiple /* comments */ on /* a */ single // line
":))"

View File

@ -34,4 +34,13 @@
; CHECK-NEXT: Int: 42; int value = 42
; CHECK-NEXT: Comma: ,
; CHECK-NEXT: Int: 100; int value = 100
; CHECK-NEXT: Identifier: Block
; CHECK-NEXT: Identifier: Comment
; CHECK-NEXT: Identifier: Ident
; CHECK-NEXT: Identifier: ifier
; CHECK-NEXT: Identifier: Line
; CHECK-NEXT: Identifier: Comment
; CHECK-NEXT: Identifier: Multiple
; CHECK-NEXT: Identifier: on
; CHECK-NEXT: Identifier: single
; CHECK-NEXT: String: ":))"

View File

@ -121,6 +121,17 @@ private:
bool canStartString() const;
// Check if tokenizer can start reading a single line comment (i.e. a comment
// that begins with '//')
bool canStartLineComment() const;
// Check if tokenizer can start reading a block comment (i.e. a comment that
// begins with '/*' and ends with '*/')
bool canStartBlockComment() const;
// Throw away all remaining characters on the current line.
void skipCurrentLine();
bool streamEof() const;
// Classify the token that is about to be read from the current position.
@ -134,6 +145,14 @@ private:
size_t DataLength, Pos;
};
// Discard the rest of the current line: locate the next '\r' or '\n' at or
// after Pos, then move Pos to the first character following that run of
// line-break characters. If the search yields no such character, Pos is set
// to DataLength.
void Tokenizer::skipCurrentLine() {
  const size_t LineBreak = Data.find_first_of("\r\n", Pos);
  const size_t NextLineStart = Data.find_first_not_of("\r\n", LineBreak);

  // A failed search yields npos; in that case fall back to DataLength.
  Pos = (NextLineStart == StringRef::npos) ? DataLength : NextLineStart;
}
Expected<std::vector<RCToken>> Tokenizer::run() {
Pos = 0;
std::vector<RCToken> Result;
@ -154,6 +173,10 @@ Expected<std::vector<RCToken>> Tokenizer::run() {
if (Error TokenError = consumeToken(TokenKind))
return std::move(TokenError);
// Comments are just deleted, don't bother saving them.
if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
continue;
RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
if (TokenKind == Kind::Identifier) {
processIdentifier(Token);
@ -195,6 +218,21 @@ Error Tokenizer::consumeToken(const Kind TokenKind) {
advance();
return Error::success();
case Kind::LineComment:
advance(2);
skipCurrentLine();
return Error::success();
case Kind::StartComment: {
advance(2);
auto EndPos = Data.find("*/", Pos);
if (EndPos == StringRef::npos)
return getStringError(
"Unclosed multi-line comment beginning at position " + Twine(Pos));
advance(EndPos - Pos);
advance(2);
return Error::success();
}
case Kind::Identifier:
while (!streamEof() && canContinueIdentifier())
advance();
@ -259,6 +297,16 @@ bool Tokenizer::canStartInt() const {
return std::isdigit(Data[Pos]);
}
bool Tokenizer::canStartBlockComment() const {
assert(!streamEof());
return Data.drop_front(Pos).startswith("/*");
}
bool Tokenizer::canStartLineComment() const {
assert(!streamEof());
return Data.drop_front(Pos).startswith("//");
}
bool Tokenizer::canContinueInt() const {
assert(!streamEof());
return std::isalnum(Data[Pos]);
@ -271,6 +319,11 @@ bool Tokenizer::canStartString() const {
// Reports whether the read position has reached DataLength.
bool Tokenizer::streamEof() const {
  const bool AtEnd = (DataLength == Pos);
  return AtEnd;
}
Kind Tokenizer::classifyCurrentToken() const {
if (canStartBlockComment())
return Kind::StartComment;
if (canStartLineComment())
return Kind::LineComment;
if (canStartInt())
return Kind::Int;
if (canStartString())

View File

@ -18,6 +18,8 @@ TOKEN(Invalid) // Invalid token. Should not occur in a valid script.
TOKEN(Int) // Integer (decimal, octal or hexadecimal).
TOKEN(String) // String value.
TOKEN(Identifier) // Script identifier (resource name or type).
TOKEN(LineComment) // Beginning of single-line comment.
TOKEN(StartComment) // Beginning of multi-line comment.
// Short tokens. They usually consist of exactly one character.
// The definitions are of the form SHORT_TOKEN(TokenName, TokenChar).