Restructure comment lexing to not recurse.

In files that contain a large number of comments, the recursive comment lexing can lead to a stack overflow.

PiperOrigin-RevId: 279867330
This commit is contained in:
River Riddle 2019-11-11 19:14:43 -08:00 committed by A. Unique TensorFlower
parent 9b9c647cef
commit 6582489219
2 changed files with 94 additions and 99 deletions

View File

@ -63,114 +63,107 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
} }
Token Lexer::lexToken() { Token Lexer::lexToken() {
// Ignore whitespace.
while (true) { while (true) {
switch (*curPtr) { const char *tokStart = curPtr;
switch (*curPtr++) {
default:
// Handle bare identifiers.
if (isalpha(curPtr[-1]))
return lexBareIdentifierOrKeyword(tokStart);
// Unknown character, emit an error.
return emitError(tokStart, "unexpected character");
case ' ': case ' ':
case '\t': case '\t':
case '\n': case '\n':
case '\r': case '\r':
++curPtr; // Handle whitespace.
continue; continue;
default:
// Terminate loop on non-whitespace, including either an embedded or
// final terminating nul character that llvm::MemoryBuffer guarantees
// will be there.
break;
}
break;
}
const char *tokStart = curPtr; case '_':
switch (*curPtr++) { // Handle bare identifiers.
default:
// Handle bare identifiers.
if (isalpha(curPtr[-1]))
return lexBareIdentifierOrKeyword(tokStart); return lexBareIdentifierOrKeyword(tokStart);
// Unknown character, emit an error. case 0:
return emitError(tokStart, "unexpected character"); // This may either be a nul character in the source file or may be the EOF
// marker that llvm::MemoryBuffer guarantees will be there.
if (curPtr - 1 == curBuffer.end())
return formToken(Token::eof, tokStart);
case '_': LLVM_FALLTHROUGH;
// Handle bare identifiers. case ':':
return lexBareIdentifierOrKeyword(tokStart); return formToken(Token::colon, tokStart);
case ',':
return formToken(Token::comma, tokStart);
case '.':
return lexEllipsis(tokStart);
case '(':
return formToken(Token::l_paren, tokStart);
case ')':
return formToken(Token::r_paren, tokStart);
case '{':
return formToken(Token::l_brace, tokStart);
case '}':
return formToken(Token::r_brace, tokStart);
case '[':
return formToken(Token::l_square, tokStart);
case ']':
return formToken(Token::r_square, tokStart);
case '<':
return formToken(Token::less, tokStart);
case '>':
return formToken(Token::greater, tokStart);
case '=':
return formToken(Token::equal, tokStart);
case 0: case '+':
// This may either be a nul character in the source file or may be the EOF return formToken(Token::plus, tokStart);
// marker that llvm::MemoryBuffer guarantees will be there. case '*':
if (curPtr - 1 == curBuffer.end()) return formToken(Token::star, tokStart);
return formToken(Token::eof, tokStart); case '-':
if (*curPtr == '>') {
++curPtr;
return formToken(Token::arrow, tokStart);
}
return formToken(Token::minus, tokStart);
LLVM_FALLTHROUGH; case '?':
case ':': return formToken(Token::question, tokStart);
return formToken(Token::colon, tokStart);
case ',':
return formToken(Token::comma, tokStart);
case '.':
return lexEllipsis(tokStart);
case '(':
return formToken(Token::l_paren, tokStart);
case ')':
return formToken(Token::r_paren, tokStart);
case '{':
return formToken(Token::l_brace, tokStart);
case '}':
return formToken(Token::r_brace, tokStart);
case '[':
return formToken(Token::l_square, tokStart);
case ']':
return formToken(Token::r_square, tokStart);
case '<':
return formToken(Token::less, tokStart);
case '>':
return formToken(Token::greater, tokStart);
case '=':
return formToken(Token::equal, tokStart);
case '+': case '/':
return formToken(Token::plus, tokStart); if (*curPtr == '/') {
case '*': skipComment();
return formToken(Token::star, tokStart); continue;
case '-': }
if (*curPtr == '>') { return emitError(tokStart, "unexpected character");
++curPtr;
return formToken(Token::arrow, tokStart); case '@':
return lexAtIdentifier(tokStart);
case '!':
LLVM_FALLTHROUGH;
case '^':
LLVM_FALLTHROUGH;
case '#':
LLVM_FALLTHROUGH;
case '%':
return lexPrefixedIdentifier(tokStart);
case '"':
return lexString(tokStart);
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return lexNumber(tokStart);
} }
return formToken(Token::minus, tokStart);
case '?':
return formToken(Token::question, tokStart);
case '/':
if (*curPtr == '/')
return lexComment();
return emitError(tokStart, "unexpected character");
case '@':
return lexAtIdentifier(tokStart);
case '!':
LLVM_FALLTHROUGH;
case '^':
LLVM_FALLTHROUGH;
case '#':
LLVM_FALLTHROUGH;
case '%':
return lexPrefixedIdentifier(tokStart);
case '"':
return lexString(tokStart);
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return lexNumber(tokStart);
} }
} }
@ -231,11 +224,11 @@ Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
return Token(kind, spelling); return Token(kind, spelling);
} }
/// Lex a comment line, starting with a semicolon. /// Skip a comment line, starting with a '//'.
/// ///
/// TODO: add a regex for comments here and to the spec. /// TODO: add a regex for comments here and to the spec.
/// ///
Token Lexer::lexComment() { void Lexer::skipComment() {
// Advance over the second '/' in a '//' comment. // Advance over the second '/' in a '//' comment.
assert(*curPtr == '/'); assert(*curPtr == '/');
++curPtr; ++curPtr;
@ -245,12 +238,12 @@ Token Lexer::lexComment() {
case '\n': case '\n':
case '\r': case '\r':
// Newline is end of comment. // Newline is end of comment.
return lexToken(); return;
case 0: case 0:
// If this is the end of the buffer, end the comment. // If this is the end of the buffer, end the comment.
if (curPtr - 1 == curBuffer.end()) { if (curPtr - 1 == curBuffer.end()) {
--curPtr; --curPtr;
return lexToken(); return;
} }
LLVM_FALLTHROUGH; LLVM_FALLTHROUGH;
default: default:

View File

@ -59,12 +59,14 @@ private:
// Lexer implementation methods. // Lexer implementation methods.
Token lexAtIdentifier(const char *tokStart); Token lexAtIdentifier(const char *tokStart);
Token lexBareIdentifierOrKeyword(const char *tokStart); Token lexBareIdentifierOrKeyword(const char *tokStart);
Token lexComment();
Token lexEllipsis(const char *tokStart); Token lexEllipsis(const char *tokStart);
Token lexNumber(const char *tokStart); Token lexNumber(const char *tokStart);
Token lexPrefixedIdentifier(const char *tokStart); Token lexPrefixedIdentifier(const char *tokStart);
Token lexString(const char *tokStart); Token lexString(const char *tokStart);
/// Skip a comment line, starting with a '//'.
void skipComment();
const llvm::SourceMgr &sourceMgr; const llvm::SourceMgr &sourceMgr;
MLIRContext *context; MLIRContext *context;