forked from OSchip/llvm-project
400 lines
11 KiB
C++
400 lines
11 KiB
C++
//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements the lexer for the MLIR textual form.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Lexer.h"
|
|
#include "mlir/IR/Diagnostics.h"
|
|
#include "mlir/IR/Identifier.h"
|
|
#include "mlir/IR/Location.h"
|
|
#include "mlir/IR/MLIRContext.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/ADT/StringSwitch.h"
|
|
#include "llvm/Support/SourceMgr.h"
|
|
using namespace mlir;
|
|
|
|
using llvm::SMLoc;
|
|
using llvm::SourceMgr;
|
|
|
|
// Returns true if 'c' is an allowable punctuation character: [$._-]
|
|
// Returns false otherwise.
|
|
static bool isPunct(char c) {
|
|
return c == '$' || c == '.' || c == '_' || c == '-';
|
|
}
|
|
|
|
Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
|
|
: sourceMgr(sourceMgr), context(context) {
|
|
auto bufferID = sourceMgr.getMainFileID();
|
|
curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
|
|
curPtr = curBuffer.begin();
|
|
}
|
|
|
|
/// Encode the specified source location information into an attribute for
|
|
/// attachment to the IR.
|
|
Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
|
|
auto &sourceMgr = getSourceMgr();
|
|
unsigned mainFileID = sourceMgr.getMainFileID();
|
|
auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
|
|
auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
|
|
|
|
return FileLineColLoc::get(buffer->getBufferIdentifier(), lineAndColumn.first,
|
|
lineAndColumn.second, context);
|
|
}
|
|
|
|
/// emitError - Emit an error message and return an Token::error token.
|
|
Token Lexer::emitError(const char *loc, const Twine &message) {
|
|
mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
|
|
message);
|
|
return formToken(Token::error, loc);
|
|
}
|
|
|
|
Token Lexer::lexToken() {
|
|
while (true) {
|
|
const char *tokStart = curPtr;
|
|
switch (*curPtr++) {
|
|
default:
|
|
// Handle bare identifiers.
|
|
if (isalpha(curPtr[-1]))
|
|
return lexBareIdentifierOrKeyword(tokStart);
|
|
|
|
// Unknown character, emit an error.
|
|
return emitError(tokStart, "unexpected character");
|
|
|
|
case ' ':
|
|
case '\t':
|
|
case '\n':
|
|
case '\r':
|
|
// Handle whitespace.
|
|
continue;
|
|
|
|
case '_':
|
|
// Handle bare identifiers.
|
|
return lexBareIdentifierOrKeyword(tokStart);
|
|
|
|
case 0:
|
|
// This may either be a nul character in the source file or may be the EOF
|
|
// marker that llvm::MemoryBuffer guarantees will be there.
|
|
if (curPtr - 1 == curBuffer.end())
|
|
return formToken(Token::eof, tokStart);
|
|
continue;
|
|
|
|
case ':':
|
|
return formToken(Token::colon, tokStart);
|
|
case ',':
|
|
return formToken(Token::comma, tokStart);
|
|
case '.':
|
|
return lexEllipsis(tokStart);
|
|
case '(':
|
|
return formToken(Token::l_paren, tokStart);
|
|
case ')':
|
|
return formToken(Token::r_paren, tokStart);
|
|
case '{':
|
|
return formToken(Token::l_brace, tokStart);
|
|
case '}':
|
|
return formToken(Token::r_brace, tokStart);
|
|
case '[':
|
|
return formToken(Token::l_square, tokStart);
|
|
case ']':
|
|
return formToken(Token::r_square, tokStart);
|
|
case '<':
|
|
return formToken(Token::less, tokStart);
|
|
case '>':
|
|
return formToken(Token::greater, tokStart);
|
|
case '=':
|
|
return formToken(Token::equal, tokStart);
|
|
|
|
case '+':
|
|
return formToken(Token::plus, tokStart);
|
|
case '*':
|
|
return formToken(Token::star, tokStart);
|
|
case '-':
|
|
if (*curPtr == '>') {
|
|
++curPtr;
|
|
return formToken(Token::arrow, tokStart);
|
|
}
|
|
return formToken(Token::minus, tokStart);
|
|
|
|
case '?':
|
|
return formToken(Token::question, tokStart);
|
|
|
|
case '/':
|
|
if (*curPtr == '/') {
|
|
skipComment();
|
|
continue;
|
|
}
|
|
return emitError(tokStart, "unexpected character");
|
|
|
|
case '@':
|
|
return lexAtIdentifier(tokStart);
|
|
|
|
case '!':
|
|
LLVM_FALLTHROUGH;
|
|
case '^':
|
|
LLVM_FALLTHROUGH;
|
|
case '#':
|
|
LLVM_FALLTHROUGH;
|
|
case '%':
|
|
return lexPrefixedIdentifier(tokStart);
|
|
case '"':
|
|
return lexString(tokStart);
|
|
|
|
case '0':
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9':
|
|
return lexNumber(tokStart);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Lex an '@foo' identifier.
|
|
///
|
|
/// symbol-ref-id ::= `@` (bare-id | string-literal)
|
|
///
|
|
Token Lexer::lexAtIdentifier(const char *tokStart) {
|
|
char cur = *curPtr++;
|
|
|
|
// Try to parse a string literal, if present.
|
|
if (cur == '"') {
|
|
Token stringIdentifier = lexString(curPtr);
|
|
if (stringIdentifier.is(Token::error))
|
|
return stringIdentifier;
|
|
return formToken(Token::at_identifier, tokStart);
|
|
}
|
|
|
|
// Otherwise, these always start with a letter or underscore.
|
|
if (!isalpha(cur) && cur != '_')
|
|
return emitError(curPtr - 1,
|
|
"@ identifier expected to start with letter or '_'");
|
|
|
|
while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
|
|
*curPtr == '$' || *curPtr == '.')
|
|
++curPtr;
|
|
return formToken(Token::at_identifier, tokStart);
|
|
}
|
|
|
|
/// Lex a bare identifier or keyword that starts with a letter.
|
|
///
|
|
/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
|
|
/// integer-type ::= `[su]?i[1-9][0-9]*`
|
|
///
|
|
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
|
|
// Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
|
|
while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
|
|
*curPtr == '$' || *curPtr == '.')
|
|
++curPtr;
|
|
|
|
// Check to see if this identifier is a keyword.
|
|
StringRef spelling(tokStart, curPtr - tokStart);
|
|
|
|
auto isAllDigit = [](StringRef str) {
|
|
return llvm::all_of(str, [](char c) { return llvm::isDigit(c); });
|
|
};
|
|
|
|
// Check for i123, si456, ui789.
|
|
if ((spelling.size() > 1 && tokStart[0] == 'i' &&
|
|
isAllDigit(spelling.drop_front())) ||
|
|
((spelling.size() > 2 && tokStart[1] == 'i' &&
|
|
(tokStart[0] == 's' || tokStart[0] == 'u')) &&
|
|
isAllDigit(spelling.drop_front(2))))
|
|
return Token(Token::inttype, spelling);
|
|
|
|
Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
|
|
#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
|
|
#include "TokenKinds.def"
|
|
.Default(Token::bare_identifier);
|
|
|
|
return Token(kind, spelling);
|
|
}
|
|
|
|
/// Skip a comment line, starting with a '//'.
|
|
///
|
|
/// TODO: add a regex for comments here and to the spec.
|
|
///
|
|
void Lexer::skipComment() {
|
|
// Advance over the second '/' in a '//' comment.
|
|
assert(*curPtr == '/');
|
|
++curPtr;
|
|
|
|
while (true) {
|
|
switch (*curPtr++) {
|
|
case '\n':
|
|
case '\r':
|
|
// Newline is end of comment.
|
|
return;
|
|
case 0:
|
|
// If this is the end of the buffer, end the comment.
|
|
if (curPtr - 1 == curBuffer.end()) {
|
|
--curPtr;
|
|
return;
|
|
}
|
|
LLVM_FALLTHROUGH;
|
|
default:
|
|
// Skip over other characters.
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Lex an ellipsis.
|
|
///
|
|
/// ellipsis ::= '...'
|
|
///
|
|
Token Lexer::lexEllipsis(const char *tokStart) {
|
|
assert(curPtr[-1] == '.');
|
|
|
|
if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
|
|
return emitError(curPtr, "expected three consecutive dots for an ellipsis");
|
|
|
|
curPtr += 2;
|
|
return formToken(Token::ellipsis, tokStart);
|
|
}
|
|
|
|
/// Lex a number literal.
|
|
///
|
|
/// integer-literal ::= digit+ | `0x` hex_digit+
|
|
/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
|
|
///
|
|
Token Lexer::lexNumber(const char *tokStart) {
|
|
assert(isdigit(curPtr[-1]));
|
|
|
|
// Handle the hexadecimal case.
|
|
if (curPtr[-1] == '0' && *curPtr == 'x') {
|
|
// If we see stuff like 0xi32, this is a literal `0` followed by an
|
|
// identifier `xi32`, stop after `0`.
|
|
if (!isxdigit(curPtr[1]))
|
|
return formToken(Token::integer, tokStart);
|
|
|
|
curPtr += 2;
|
|
while (isxdigit(*curPtr))
|
|
++curPtr;
|
|
|
|
return formToken(Token::integer, tokStart);
|
|
}
|
|
|
|
// Handle the normal decimal case.
|
|
while (isdigit(*curPtr))
|
|
++curPtr;
|
|
|
|
if (*curPtr != '.')
|
|
return formToken(Token::integer, tokStart);
|
|
++curPtr;
|
|
|
|
// Skip over [0-9]*([eE][-+]?[0-9]+)?
|
|
while (isdigit(*curPtr))
|
|
++curPtr;
|
|
|
|
if (*curPtr == 'e' || *curPtr == 'E') {
|
|
if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
|
|
((curPtr[1] == '-' || curPtr[1] == '+') &&
|
|
isdigit(static_cast<unsigned char>(curPtr[2])))) {
|
|
curPtr += 2;
|
|
while (isdigit(*curPtr))
|
|
++curPtr;
|
|
}
|
|
}
|
|
return formToken(Token::floatliteral, tokStart);
|
|
}
|
|
|
|
/// Lex an identifier that starts with a prefix followed by suffix-id.
|
|
///
|
|
/// attribute-id ::= `#` suffix-id
|
|
/// ssa-id ::= '%' suffix-id
|
|
/// block-id ::= '^' suffix-id
|
|
/// type-id ::= '!' suffix-id
|
|
/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
|
|
/// id-punct ::= `$` | `.` | `_` | `-`
|
|
///
|
|
Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
|
|
Token::Kind kind;
|
|
StringRef errorKind;
|
|
switch (*tokStart) {
|
|
case '#':
|
|
kind = Token::hash_identifier;
|
|
errorKind = "invalid attribute name";
|
|
break;
|
|
case '%':
|
|
kind = Token::percent_identifier;
|
|
errorKind = "invalid SSA name";
|
|
break;
|
|
case '^':
|
|
kind = Token::caret_identifier;
|
|
errorKind = "invalid block name";
|
|
break;
|
|
case '!':
|
|
kind = Token::exclamation_identifier;
|
|
errorKind = "invalid type identifier";
|
|
break;
|
|
default:
|
|
llvm_unreachable("invalid caller");
|
|
}
|
|
|
|
// Parse suffix-id.
|
|
if (isdigit(*curPtr)) {
|
|
// If suffix-id starts with a digit, the rest must be digits.
|
|
while (isdigit(*curPtr)) {
|
|
++curPtr;
|
|
}
|
|
} else if (isalpha(*curPtr) || isPunct(*curPtr)) {
|
|
do {
|
|
++curPtr;
|
|
} while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
|
|
} else {
|
|
return emitError(curPtr - 1, errorKind);
|
|
}
|
|
|
|
return formToken(kind, tokStart);
|
|
}
|
|
|
|
/// Lex a string literal.
|
|
///
|
|
/// string-literal ::= '"' [^"\n\f\v\r]* '"'
|
|
///
|
|
/// TODO: define escaping rules.
|
|
Token Lexer::lexString(const char *tokStart) {
|
|
assert(curPtr[-1] == '"');
|
|
|
|
while (true) {
|
|
switch (*curPtr++) {
|
|
case '"':
|
|
return formToken(Token::string, tokStart);
|
|
case 0:
|
|
// If this is a random nul character in the middle of a string, just
|
|
// include it. If it is the end of file, then it is an error.
|
|
if (curPtr - 1 != curBuffer.end())
|
|
continue;
|
|
LLVM_FALLTHROUGH;
|
|
case '\n':
|
|
case '\v':
|
|
case '\f':
|
|
return emitError(curPtr - 1, "expected '\"' in string literal");
|
|
case '\\':
|
|
// Handle explicitly a few escapes.
|
|
if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
|
|
++curPtr;
|
|
else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
|
|
// Support \xx for two hex digits.
|
|
curPtr += 2;
|
|
else
|
|
return emitError(curPtr - 1, "unknown escape in string literal");
|
|
continue;
|
|
|
|
default:
|
|
continue;
|
|
}
|
|
}
|
|
}
|