Restructure comment lexing to not recurse.

In files that contain a large number of comments, the mutual recursion between lexToken and lexComment can lead to a stack overflow.

PiperOrigin-RevId: 279867330
2019-11-11 19:14:43 -08:00 · 2019-11-11 19:14:43 -08:00 · 6582489219
parent 9b9c647cef
commit 6582489219
2 changed files with 94 additions and 99 deletions
--- a/mlir/lib/Parser/Lexer.cpp
+++ b/mlir/lib/Parser/Lexer.cpp
@ -63,114 +63,107 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }
 Token Lexer::lexToken() {
  // Ignore whitespace.
  while (true) {
-    switch (*curPtr) {
+    const char *tokStart = curPtr;
    switch (*curPtr++) {
    default:
      // Handle bare identifiers.
      if (isalpha(curPtr[-1]))
        return lexBareIdentifierOrKeyword(tokStart);
      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");
    case ' ':
    case '\t':
    case '\n':
    case '\r':
-      ++curPtr;
+      // Handle whitespace.
      continue;
    default:
      // Terminate loop on non-whitespace, including either an embedded or
      // final terminating nul character that llvm::MemoryBuffer guarantees
      // will be there.
      break;
    }
    break;
  }
-  const char *tokStart = curPtr;
+    case '_':
-  switch (*curPtr++) {
+      // Handle bare identifiers.
  default:
    // Handle bare identifiers.
    if (isalpha(curPtr[-1]))
      return lexBareIdentifierOrKeyword(tokStart);
-    // Unknown character, emit an error.
+    case 0:
-    return emitError(tokStart, "unexpected character");
+      // This may either be a nul character in the source file or may be the EOF
      // marker that llvm::MemoryBuffer guarantees will be there.
      if (curPtr - 1 == curBuffer.end())
        return formToken(Token::eof, tokStart);
-  case '_':
+      LLVM_FALLTHROUGH;
-    // Handle bare identifiers.
+    case ':':
-    return lexBareIdentifierOrKeyword(tokStart);
+      return formToken(Token::colon, tokStart);
    case ',':
      return formToken(Token::comma, tokStart);
    case '.':
      return lexEllipsis(tokStart);
    case '(':
      return formToken(Token::l_paren, tokStart);
    case ')':
      return formToken(Token::r_paren, tokStart);
    case '{':
      return formToken(Token::l_brace, tokStart);
    case '}':
      return formToken(Token::r_brace, tokStart);
    case '[':
      return formToken(Token::l_square, tokStart);
    case ']':
      return formToken(Token::r_square, tokStart);
    case '<':
      return formToken(Token::less, tokStart);
    case '>':
      return formToken(Token::greater, tokStart);
    case '=':
      return formToken(Token::equal, tokStart);
-  case 0:
+    case '+':
-    // This may either be a nul character in the source file or may be the EOF
+      return formToken(Token::plus, tokStart);
-    // marker that llvm::MemoryBuffer guarantees will be there.
+    case '*':
-    if (curPtr - 1 == curBuffer.end())
+      return formToken(Token::star, tokStart);
-      return formToken(Token::eof, tokStart);
+    case '-':
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return formToken(Token::minus, tokStart);
-    LLVM_FALLTHROUGH;
+    case '?':
-  case ':':
+      return formToken(Token::question, tokStart);
    return formToken(Token::colon, tokStart);
  case ',':
    return formToken(Token::comma, tokStart);
  case '.':
    return lexEllipsis(tokStart);
  case '(':
    return formToken(Token::l_paren, tokStart);
  case ')':
    return formToken(Token::r_paren, tokStart);
  case '{':
    return formToken(Token::l_brace, tokStart);
  case '}':
    return formToken(Token::r_brace, tokStart);
  case '[':
    return formToken(Token::l_square, tokStart);
  case ']':
    return formToken(Token::r_square, tokStart);
  case '<':
    return formToken(Token::less, tokStart);
  case '>':
    return formToken(Token::greater, tokStart);
  case '=':
    return formToken(Token::equal, tokStart);
-  case '+':
+    case '/':
-    return formToken(Token::plus, tokStart);
+      if (*curPtr == '/') {
-  case '*':
+        skipComment();
-    return formToken(Token::star, tokStart);
+        continue;
-  case '-':
+      }
-    if (*curPtr == '>') {
+      return emitError(tokStart, "unexpected character");
-      ++curPtr;
+
-      return formToken(Token::arrow, tokStart);
+    case '@':
      return lexAtIdentifier(tokStart);
    case '!':
      LLVM_FALLTHROUGH;
    case '^':
      LLVM_FALLTHROUGH;
    case '#':
      LLVM_FALLTHROUGH;
    case '%':
      return lexPrefixedIdentifier(tokStart);
    case '"':
      return lexString(tokStart);
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      return lexNumber(tokStart);
    }
    return formToken(Token::minus, tokStart);
  case '?':
    return formToken(Token::question, tokStart);
  case '/':
    if (*curPtr == '/')
      return lexComment();
    return emitError(tokStart, "unexpected character");
  case '@':
    return lexAtIdentifier(tokStart);
  case '!':
    LLVM_FALLTHROUGH;
  case '^':
    LLVM_FALLTHROUGH;
  case '#':
    LLVM_FALLTHROUGH;
  case '%':
    return lexPrefixedIdentifier(tokStart);
  case '"':
    return lexString(tokStart);
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
    return lexNumber(tokStart);
  }
 }
@ -231,11 +224,11 @@ Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
  return Token(kind, spelling);
 }
-/// Lex a comment line, starting with a semicolon.
+/// Skip a comment line, starting with a '//'.
 ///
 ///   TODO: add a regex for comments here and to the spec.
 ///
-Token Lexer::lexComment() {
+void Lexer::skipComment() {
  // Advance over the second '/' in a '//' comment.
  assert(*curPtr == '/');
  ++curPtr;
@ -245,12 +238,12 @@ Token Lexer::lexComment() {
    case '\n':
    case '\r':
      // Newline is end of comment.
-      return lexToken();
+      return;
    case 0:
      // If this is the end of the buffer, end the comment.
      if (curPtr - 1 == curBuffer.end()) {
        --curPtr;
-        return lexToken();
+        return;
      }
      LLVM_FALLTHROUGH;
    default:
--- a/mlir/lib/Parser/Lexer.h
+++ b/mlir/lib/Parser/Lexer.h
@ -59,12 +59,14 @@ private:
  // Lexer implementation methods.
  Token lexAtIdentifier(const char *tokStart);
  Token lexBareIdentifierOrKeyword(const char *tokStart);
  Token lexComment();
  Token lexEllipsis(const char *tokStart);
  Token lexNumber(const char *tokStart);
  Token lexPrefixedIdentifier(const char *tokStart);
  Token lexString(const char *tokStart);
  /// Skip a comment line, starting with a '//'.
  void skipComment();
  const llvm::SourceMgr &sourceMgr;
  MLIRContext *context;