llvm-project/mlir/lib/Parser/Lexer.cpp

//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//

#include "Lexer.h"
#include "llvm/Support/SourceMgr.h"
using namespace mlir;
using llvm::SMLoc;
using llvm::SourceMgr;

/// Construct a lexer that reads from the main file buffer registered with
/// `sourceMgr`, positioning the cursor at the first character of that buffer.
Lexer::Lexer(llvm::SourceMgr &sourceMgr) : sourceMgr(sourceMgr) {
  // Look up the main file's memory buffer and keep a view of its contents.
  curBuffer =
      sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID())->getBuffer();

  // Lexing starts at the beginning of the buffer.
  curPtr = curBuffer.begin();
}

/// Report `message` as an error diagnostic at source position `loc`, then
/// return the Token::error token formed at `loc`.
Token Lexer::emitError(const char *loc, const Twine &message) {
  // TODO(clattner): If/when we want to implement a -verify mode, this will need
  // to package up errors into SMDiagnostic and report them.
  const SMLoc diagLoc = SMLoc::getFromPointer(loc);
  sourceMgr.PrintMessage(diagLoc, SourceMgr::DK_Error, message);

  return formToken(Token::error, loc);
}

/// Lex and return the next token, advancing curPtr past it.
///
/// Whitespace (and nul characters embedded in the file) are skipped. A `;`
/// hands control to lexComment(), `@` to lexAtIdentifier(), a digit to
/// lexNumber() and a letter to lexBareIdentifierOrKeyword(). Any other
/// unrecognized character is reported through emitError(), whose error token
/// is returned.
Token Lexer::lexToken() {
  // Skip whitespace with a loop instead of recursing, so that a long run of
  // blank characters cannot grow the call stack.
  while (true) {
    const char *tokStart = curPtr;
    const char curChar = *curPtr++;

    switch (curChar) {
    default:
      // Handle bare identifiers.  The character is converted to unsigned char
      // first: passing a negative value to the <cctype> classifiers is
      // undefined behavior, which would otherwise be reachable for non-ASCII
      // bytes on targets where plain char is signed.
      if (isalpha(static_cast<unsigned char>(curChar)))
        return lexBareIdentifierOrKeyword(tokStart);

      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");

    case 0:
      // This may either be a nul character in the source file or may be the
      // EOF marker that llvm::MemoryBuffer guarantees will be there.
      if (curPtr - 1 == curBuffer.end())
        return formToken(Token::eof, tokStart);

      // An embedded nul is treated like whitespace.
      LLVM_FALLTHROUGH;
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Ignore whitespace and lex the token that follows it.
      continue;

    case ':': return formToken(Token::colon, tokStart);
    case ',': return formToken(Token::comma, tokStart);
    case '(': return formToken(Token::l_paren, tokStart);
    case ')': return formToken(Token::r_paren, tokStart);
    case '{': return formToken(Token::l_brace, tokStart);
    case '}': return formToken(Token::r_brace, tokStart);
    case '<': return formToken(Token::less, tokStart);
    case '>': return formToken(Token::greater, tokStart);

    case '-':
      // A '-' is only valid as the first character of the "->" arrow.
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return emitError(tokStart, "unexpected character");

    case '?':
      // "??" is a single token; a lone '?' is a different one.
      if (*curPtr == '?') {
        ++curPtr;
        return formToken(Token::questionquestion, tokStart);
      }

      return formToken(Token::question, tokStart);

    case ';': return lexComment();
    case '@': return lexAtIdentifier(tokStart);

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return lexNumber(tokStart);
    }
  }
}

/// Skip a comment, which starts with a semicolon and runs to the end of the
/// line, then lex and return the token that follows it.
///
///   TODO: add a regex for comments here and to the spec.
///
Token Lexer::lexComment() {
  for (;;) {
    const char ch = *curPtr++;

    // A newline character terminates the comment.
    if (ch == '\n' || ch == '\r')
      return lexToken();

    // A nul located at the end of the buffer also terminates the comment.
    // Back up so that lexToken() sees the end-of-buffer marker itself.
    if (ch == 0 && curPtr - 1 == curBuffer.end()) {
      --curPtr;
      return lexToken();
    }

    // Anything else (including a nul embedded in the file) is comment text.
  }
}

/// Lex a bare identifier or keyword that starts with a letter.
///
///   bare-id ::= letter (letter|digit|[_])*
///
/// `tokStart` points at the first character of the identifier; curPtr is
/// expected to already be past that character. Returns a keyword token when
/// the spelling matches one of the keywords listed below, and a
/// Token::bare_identifier token otherwise.
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
  // Classify through unsigned char: handing a negative value to the <cctype>
  // functions is undefined behavior, which plain char permits for non-ASCII
  // bytes on targets where char is signed.
  auto isIdentifierChar = [](char c) {
    const unsigned char uc = static_cast<unsigned char>(c);
    return isalpha(uc) || isdigit(uc) || c == '_';
  };

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isIdentifierChar(*curPtr))
    ++curPtr;

  // Check to see if this identifier is a keyword.
  StringRef spelling(tokStart, curPtr-tokStart);

  Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
    .Case("bf16", Token::kw_bf16)
    .Case("br", Token::kw_br)
    .Case("cfgfunc", Token::kw_cfgfunc)
    .Case("extfunc", Token::kw_extfunc)
    .Case("f16", Token::kw_f16)
    .Case("f32", Token::kw_f32)
    .Case("f64", Token::kw_f64)
    .Case("i1", Token::kw_i1)
    .Case("i16", Token::kw_i16)
    .Case("i32", Token::kw_i32)
    .Case("i64", Token::kw_i64)
    .Case("i8", Token::kw_i8)
    .Case("int", Token::kw_int)
    .Case("memref", Token::kw_memref)
    .Case("mlfunc", Token::kw_mlfunc)
    .Case("return", Token::kw_return)
    .Case("tensor", Token::kw_tensor)
    .Case("vector", Token::kw_vector)
    .Default(Token::bare_identifier);

  return Token(kind, spelling);
}

/// Lex an '@foo' identifier.
///
///   function-id ::= `@` bare-id
///
/// `tokStart` points at the '@'; curPtr is expected to point at the character
/// that follows it. Returns a Token::at_identifier token on success. If the
/// '@' is not followed by a letter, an error is emitted at the offending
/// character and the resulting error token is returned.
Token Lexer::lexAtIdentifier(const char *tokStart) {
  // Classify through unsigned char: handing a negative value to the <cctype>
  // functions is undefined behavior, which plain char permits for non-ASCII
  // bytes on targets where char is signed.
  auto isLetter = [](char c) {
    return isalpha(static_cast<unsigned char>(c)) != 0;
  };
  auto isIdentifierChar = [](char c) {
    const unsigned char uc = static_cast<unsigned char>(c);
    return isalpha(uc) || isdigit(uc) || c == '_';
  };

  // These always start with a letter.
  const char *firstChar = curPtr;
  if (!isLetter(*firstChar)) {
    // Consume the offending character, except when it is the nul at the end
    // of the buffer: stepping over that one would leave curPtr past the end
    // of the buffer, and the next lexToken() call would read out of bounds.
    if (firstChar != curBuffer.end())
      ++curPtr;
    return emitError(firstChar, "expected letter in @ identifier");
  }
  ++curPtr;

  // Consume the remainder of the identifier: [0-9a-zA-Z_]*
  while (isIdentifierChar(*curPtr))
    ++curPtr;
  return formToken(Token::at_identifier, tokStart);
}

/// Lex an integer literal.
///
///   integer-literal ::= digit+ | `0x` hex_digit+
///
/// `tokStart` points at the first digit; curPtr must already be past that
/// digit (this is asserted). Returns a Token::integer token, or the error
/// token produced by emitError() when `0x` is not followed by a hexadecimal
/// digit.
Token Lexer::lexNumber(const char *tokStart) {
  // Classify through unsigned char: handing a negative value to the <cctype>
  // functions is undefined behavior, which plain char permits for non-ASCII
  // bytes on targets where char is signed.
  auto isDecimalDigit = [](char c) {
    return isdigit(static_cast<unsigned char>(c)) != 0;
  };
  auto isHexDigit = [](char c) {
    return isxdigit(static_cast<unsigned char>(c)) != 0;
  };

  assert(isDecimalDigit(curPtr[-1]));

  // Handle the hexadecimal case.
  if (curPtr[-1] == '0' && *curPtr == 'x') {
    ++curPtr;

    // At least one hex digit must follow the "0x" prefix.
    if (!isHexDigit(*curPtr))
      return emitError(curPtr, "expected hexadecimal digit");

    while (isHexDigit(*curPtr))
      ++curPtr;

    return formToken(Token::integer, tokStart);
  }

  // Handle the normal decimal case.
  while (isDecimalDigit(*curPtr))
    ++curPtr;

  return formToken(Token::integer, tokStart);
}
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//`
			`//`
			`// Copyright 2019 The MLIR Authors.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`// =============================================================================`
			`//`
			`// This file implements the lexer for the MLIR textual form.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "Lexer.h"`
			`#include "llvm/Support/SourceMgr.h"`
			`using namespace mlir;`
			`using llvm::SMLoc;`
			`using llvm::SourceMgr;`

			`Lexer::Lexer(llvm::SourceMgr &sourceMgr) : sourceMgr(sourceMgr) {`
			`auto bufferID = sourceMgr.getMainFileID();`
			`curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();`
			`curPtr = curBuffer.begin();`
			`}`

			`/// emitError - Emit an error message and return an Token::error token.`
			`Token Lexer::emitError(const char *loc, const Twine &message) {`
			`// TODO(clattner): If/when we want to implement a -verify mode, this will need`
			`// to package up errors into SMDiagnostic and report them.`
			`sourceMgr.PrintMessage(SMLoc::getFromPointer(loc), SourceMgr::DK_Error,`
			`message);`
			`return formToken(Token::error, loc);`
			`}`

			`Token Lexer::lexToken() {`
			`const char *tokStart = curPtr;`

			`switch (*curPtr++) {`
			`default:`
			`// Handle bare identifiers.`
			`if (isalpha(curPtr[-1]))`
			`return lexBareIdentifierOrKeyword(tokStart);`

			`// Unknown character, emit an error.`
			`return emitError(tokStart, "unexpected character");`

			`case 0:`
			`// This may either be a nul character in the source file or may be the EOF`
			`// marker that llvm::MemoryBuffer guarantees will be there.`
			`if (curPtr-1 == curBuffer.end())`
			`return formToken(Token::eof, tokStart);`

			`LLVM_FALLTHROUGH;`
			`case ' ':`
			`case '\t':`
			`case '\n':`
			`case '\r':`
			`// Ignore whitespace.`
			`return lexToken();`

Implement parser/IR support for CFG functions, basic blocks and return instruction. This is pretty much minimal scaffolding for this step. Basic block arguments, instructions, other terminators, a proper IR representation for blocks/instructions, etc are all coming. PiperOrigin-RevId: 201826439 2018-06-24 07:03:42 +08:00			`case ':': return formToken(Token::colon, tokStart);`
Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00			`case ',': return formToken(Token::comma, tokStart);`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`case '(': return formToken(Token::l_paren, tokStart);`
			`case ')': return formToken(Token::r_paren, tokStart);`
Implement parser/IR support for CFG functions, basic blocks and return instruction. This is pretty much minimal scaffolding for this step. Basic block arguments, instructions, other terminators, a proper IR representation for blocks/instructions, etc are all coming. PiperOrigin-RevId: 201826439 2018-06-24 07:03:42 +08:00			`case '{': return formToken(Token::l_brace, tokStart);`
			`case '}': return formToken(Token::r_brace, tokStart);`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`case '<': return formToken(Token::less, tokStart);`
			`case '>': return formToken(Token::greater, tokStart);`

Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00			`case '-':`
			`if (*curPtr == '>') {`
			`++curPtr;`
			`return formToken(Token::arrow, tokStart);`
			`}`
			`return emitError(tokStart, "unexpected character");`

			`case '?':`
			`if (*curPtr == '?') {`
			`++curPtr;`
			`return formToken(Token::questionquestion, tokStart);`
			`}`

			`return formToken(Token::question, tokStart);`

Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`case ';': return lexComment();`
			`case '@': return lexAtIdentifier(tokStart);`
Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00
			`case '0': case '1': case '2': case '3': case '4':`
			`case '5': case '6': case '7': case '8': case '9':`
			`return lexNumber(tokStart);`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`}`
			`}`

			`/// Lex a comment line, starting with a semicolon.`
			`///`
			`/// TODO: add a regex for comments here and to the spec.`
			`///`
			`Token Lexer::lexComment() {`
			`while (true) {`
			`switch (*curPtr++) {`
			`case '\n':`
			`case '\r':`
			`// Newline is end of comment.`
			`return lexToken();`
			`case 0:`
			`// If this is the end of the buffer, end the comment.`
			`if (curPtr-1 == curBuffer.end()) {`
			`--curPtr;`
			`return lexToken();`
			`}`
			`LLVM_FALLTHROUGH;`
			`default:`
			`// Skip over other characters.`
			`break;`
			`}`
			`}`
			`}`

			`/// Lex a bare identifier or keyword that starts with a letter.`
			`///`
Add the unconditional branch instruction, improve diagnostics for block references. PiperOrigin-RevId: 201872745 2018-06-25 02:18:29 +08:00			`/// bare-id ::= letter (letter\|digit\|[_])*`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`///`
			`Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {`
Add the unconditional branch instruction, improve diagnostics for block references. PiperOrigin-RevId: 201872745 2018-06-25 02:18:29 +08:00			`// Match the rest of the identifier regex: [0-9a-zA-Z_]*`
			`while (isalpha(*curPtr) \|\| isdigit(*curPtr) \|\| *curPtr == '_')`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`++curPtr;`

			`// Check to see if this identifier is a keyword.`
			`StringRef spelling(tokStart, curPtr-tokStart);`

			`Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)`
Add the unconditional branch instruction, improve diagnostics for block references. PiperOrigin-RevId: 201872745 2018-06-25 02:18:29 +08:00			`.Case("bf16", Token::kw_bf16)`
			`.Case("br", Token::kw_br)`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`.Case("cfgfunc", Token::kw_cfgfunc)`
			`.Case("extfunc", Token::kw_extfunc)`
Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00			`.Case("f16", Token::kw_f16)`
			`.Case("f32", Token::kw_f32)`
			`.Case("f64", Token::kw_f64)`
			`.Case("i1", Token::kw_i1)`
			`.Case("i16", Token::kw_i16)`
			`.Case("i32", Token::kw_i32)`
			`.Case("i64", Token::kw_i64)`
			`.Case("i8", Token::kw_i8)`
			`.Case("int", Token::kw_int)`
			`.Case("memref", Token::kw_memref)`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`.Case("mlfunc", Token::kw_mlfunc)`
Implement parser/IR support for CFG functions, basic blocks and return instruction. This is pretty much minimal scaffolding for this step. Basic block arguments, instructions, other terminators, a proper IR representation for blocks/instructions, etc are all coming. PiperOrigin-RevId: 201826439 2018-06-24 07:03:42 +08:00			`.Case("return", Token::kw_return)`
Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00			`.Case("tensor", Token::kw_tensor)`
			`.Case("vector", Token::kw_vector)`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`.Default(Token::bare_identifier);`

			`return Token(kind, spelling);`
			`}`

			`/// Lex an '@foo' identifier.`
			`///`
			/// function-id ::= `@` bare-id
			`///`
			`Token Lexer::lexAtIdentifier(const char *tokStart) {`
			`// These always start with a letter.`
			`if (!isalpha(*curPtr++))`
			`return emitError(curPtr-1, "expected letter in @ identifier");`

Add the unconditional branch instruction, improve diagnostics for block references. PiperOrigin-RevId: 201872745 2018-06-25 02:18:29 +08:00			`while (isalpha(*curPtr) \|\| isdigit(*curPtr) \|\| *curPtr == '_')`
Implement enough of a lexer and parser for MLIR to parse extfunc's without arguments. PiperOrigin-RevId: 201706570 2018-06-23 01:39:19 +08:00			`++curPtr;`
			`return formToken(Token::at_identifier, tokStart);`
			`}`
Implement parser and lexer support for most of the type grammar. Semi-affine maps and address spaces are not yet supported (someone want to take this on?). We also don't generate IR objects for types yet, which I plan to tackle next. PiperOrigin-RevId: 201754283 2018-06-23 06:52:02 +08:00
			`/// Lex an integer literal.`
			`///`
			/// integer-literal ::= digit+ \| `0x` hex_digit+
			`///`
			`Token Lexer::lexNumber(const char *tokStart) {`
			`assert(isdigit(curPtr[-1]));`

			`// Handle the hexadecimal case.`
			`if (curPtr[-1] == '0' && *curPtr == 'x') {`
			`++curPtr;`

			`if (!isxdigit(*curPtr))`
			`return emitError(curPtr, "expected hexadecimal digit");`

			`while (isxdigit(*curPtr))`
			`++curPtr;`

			`return formToken(Token::integer, tokStart);`
			`}`

			`// Handle the normal decimal case.`
			`while (isdigit(*curPtr))`
			`++curPtr;`

			`return formToken(Token::integer, tokStart);`
			`}`