Restructure comment lexing to not recurse.

In files that contain a large number of comments, the previous recursive comment lexing can lead to a stack overflow.

PiperOrigin-RevId: 279867330
This commit is contained in:
River Riddle 2019-11-11 19:14:43 -08:00 committed by A. Unique TensorFlower
parent 9b9c647cef
commit 6582489219
2 changed files with 94 additions and 99 deletions

View File

@ -63,114 +63,107 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
}
Token Lexer::lexToken() {
// Ignore whitespace.
while (true) {
switch (*curPtr) {
const char *tokStart = curPtr;
switch (*curPtr++) {
default:
// Handle bare identifiers.
if (isalpha(curPtr[-1]))
return lexBareIdentifierOrKeyword(tokStart);
// Unknown character, emit an error.
return emitError(tokStart, "unexpected character");
case ' ':
case '\t':
case '\n':
case '\r':
++curPtr;
// Handle whitespace.
continue;
default:
// Terminate loop on non-whitespace, including either an embedded or
// final terminating nul character that llvm::MemoryBuffer guarantees
// will be there.
break;
}
break;
}
const char *tokStart = curPtr;
switch (*curPtr++) {
default:
// Handle bare identifiers.
if (isalpha(curPtr[-1]))
case '_':
// Handle bare identifiers.
return lexBareIdentifierOrKeyword(tokStart);
// Unknown character, emit an error.
return emitError(tokStart, "unexpected character");
case 0:
// This may either be a nul character in the source file or may be the EOF
// marker that llvm::MemoryBuffer guarantees will be there.
if (curPtr - 1 == curBuffer.end())
return formToken(Token::eof, tokStart);
case '_':
// Handle bare identifiers.
return lexBareIdentifierOrKeyword(tokStart);
LLVM_FALLTHROUGH;
case ':':
return formToken(Token::colon, tokStart);
case ',':
return formToken(Token::comma, tokStart);
case '.':
return lexEllipsis(tokStart);
case '(':
return formToken(Token::l_paren, tokStart);
case ')':
return formToken(Token::r_paren, tokStart);
case '{':
return formToken(Token::l_brace, tokStart);
case '}':
return formToken(Token::r_brace, tokStart);
case '[':
return formToken(Token::l_square, tokStart);
case ']':
return formToken(Token::r_square, tokStart);
case '<':
return formToken(Token::less, tokStart);
case '>':
return formToken(Token::greater, tokStart);
case '=':
return formToken(Token::equal, tokStart);
case 0:
// This may either be a nul character in the source file or may be the EOF
// marker that llvm::MemoryBuffer guarantees will be there.
if (curPtr - 1 == curBuffer.end())
return formToken(Token::eof, tokStart);
case '+':
return formToken(Token::plus, tokStart);
case '*':
return formToken(Token::star, tokStart);
case '-':
if (*curPtr == '>') {
++curPtr;
return formToken(Token::arrow, tokStart);
}
return formToken(Token::minus, tokStart);
LLVM_FALLTHROUGH;
case ':':
return formToken(Token::colon, tokStart);
case ',':
return formToken(Token::comma, tokStart);
case '.':
return lexEllipsis(tokStart);
case '(':
return formToken(Token::l_paren, tokStart);
case ')':
return formToken(Token::r_paren, tokStart);
case '{':
return formToken(Token::l_brace, tokStart);
case '}':
return formToken(Token::r_brace, tokStart);
case '[':
return formToken(Token::l_square, tokStart);
case ']':
return formToken(Token::r_square, tokStart);
case '<':
return formToken(Token::less, tokStart);
case '>':
return formToken(Token::greater, tokStart);
case '=':
return formToken(Token::equal, tokStart);
case '?':
return formToken(Token::question, tokStart);
case '+':
return formToken(Token::plus, tokStart);
case '*':
return formToken(Token::star, tokStart);
case '-':
if (*curPtr == '>') {
++curPtr;
return formToken(Token::arrow, tokStart);
case '/':
if (*curPtr == '/') {
skipComment();
continue;
}
return emitError(tokStart, "unexpected character");
case '@':
return lexAtIdentifier(tokStart);
case '!':
LLVM_FALLTHROUGH;
case '^':
LLVM_FALLTHROUGH;
case '#':
LLVM_FALLTHROUGH;
case '%':
return lexPrefixedIdentifier(tokStart);
case '"':
return lexString(tokStart);
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return lexNumber(tokStart);
}
return formToken(Token::minus, tokStart);
case '?':
return formToken(Token::question, tokStart);
case '/':
if (*curPtr == '/')
return lexComment();
return emitError(tokStart, "unexpected character");
case '@':
return lexAtIdentifier(tokStart);
case '!':
LLVM_FALLTHROUGH;
case '^':
LLVM_FALLTHROUGH;
case '#':
LLVM_FALLTHROUGH;
case '%':
return lexPrefixedIdentifier(tokStart);
case '"':
return lexString(tokStart);
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return lexNumber(tokStart);
}
}
@ -231,11 +224,11 @@ Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
return Token(kind, spelling);
}
/// Lex a comment line, starting with a semicolon.
/// Skip a comment line, starting with a '//'.
///
/// TODO: add a regex for comments here and to the spec.
///
Token Lexer::lexComment() {
void Lexer::skipComment() {
// Advance over the second '/' in a '//' comment.
assert(*curPtr == '/');
++curPtr;
@ -245,12 +238,12 @@ Token Lexer::lexComment() {
case '\n':
case '\r':
// Newline is end of comment.
return lexToken();
return;
case 0:
// If this is the end of the buffer, end the comment.
if (curPtr - 1 == curBuffer.end()) {
--curPtr;
return lexToken();
return;
}
LLVM_FALLTHROUGH;
default:

View File

@ -59,12 +59,14 @@ private:
// Lexer implementation methods.
Token lexAtIdentifier(const char *tokStart);
Token lexBareIdentifierOrKeyword(const char *tokStart);
Token lexComment();
Token lexEllipsis(const char *tokStart);
Token lexNumber(const char *tokStart);
Token lexPrefixedIdentifier(const char *tokStart);
Token lexString(const char *tokStart);
/// Skip a comment line, starting with a '//'.
void skipComment();
const llvm::SourceMgr &sourceMgr;
MLIRContext *context;