llvm-project/mlir/lib/Parser/Lexer.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

406 lines
11 KiB
C++
Raw Normal View History

//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//
#include "Lexer.h"
Introduce a new API for emitting diagnostics with Diagnostic and InFlightDiagnostic. The Diagnostic class contains all of the information necessary to report a diagnostic to the DiagnosticEngine. It should generally not be constructed directly, and instead used transitively via InFlightDiagnostic. A diagnostic is currently comprised of several different elements: * A severity level. * A source Location. * A list of DiagnosticArguments that help compose and comprise the output message. * A DiagnosticArgument represents any value that may be part of the diagnostic, e.g. string, integer, Type, Attribute, etc. * Arguments can be added to the diagnostic via the stream(<<) operator. * (In a future cl) A list of attached notes. * These are in the form of other diagnostics that provide supplemental information to the main diagnostic, but do not have context on their own. The InFlightDiagnostic class represents an RAII wrapper around a Diagnostic that is set to be reported with the diagnostic engine. This allows for the user to modify a diagnostic that is inflight. The internally wrapped diagnostic can be reported directly or automatically upon destruction. These classes allow for more natural composition of diagnostics by removing the restriction that the message of a diagnostic is comprised of a single Twine. They should also allow for nice incremental improvements to the diagnostics experience in the future, e.g. formatv style diagnostics. Simple Example: emitError(loc, "integer bitwidth is limited to " + Twine(IntegerType::kMaxWidth) + " bits"); emitError(loc) << "integer bitwidth is limited to " << IntegerType::kMaxWidth << " bits"; -- PiperOrigin-RevId: 246526439
2019-05-04 01:01:01 +08:00
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Identifier.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/SourceMgr.h"
using namespace mlir;
using llvm::SMLoc;
using llvm::SourceMgr;
/// Report whether `c` is one of the punctuation characters accepted inside
/// identifiers: `$`, `.`, `_` or `-`. Every other character yields false.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
/// Construct a lexer over the main file of `sourceMgr`. The lexing cursor
/// starts at the first character of that file's buffer.
Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
    : sourceMgr(sourceMgr), context(context) {
  // Only the main file of the source manager is lexed.
  const unsigned mainFileID = sourceMgr.getMainFileID();
  auto *mainBuffer = sourceMgr.getMemoryBuffer(mainFileID);

  curBuffer = mainBuffer->getBuffer();
  curPtr = curBuffer.begin();
}
/// Convert a raw source-manager location into a file/line/column Location
/// that can be attached to the IR. The file name is the identifier of the
/// source manager's main buffer; the column is 1-based.
Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
  auto &srcMgr = getSourceMgr();
  const unsigned mainID = srcMgr.getMainFileID();
  const char *locPtr = loc.getPointer();

  // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
  // use it here.
  auto &info = srcMgr.getBufferInfo(mainID);
  const unsigned line = info.getLineNumber(locPtr);

  // The column is the distance from the start of the line, counted from 1.
  auto *lineStart = info.getPointerForLineNumber(line);
  const unsigned col = static_cast<unsigned>(locPtr - lineStart) + 1;

  auto *mainBuffer = srcMgr.getMemoryBuffer(mainID);
  return FileLineColLoc::get(context, mainBuffer->getBufferIdentifier(), line,
                             col);
}
/// Report `message` as an error at the buffer position `loc`, then hand back
/// a Token::error token formed at that same position.
Token Lexer::emitError(const char *loc, const Twine &message) {
  Location errorLoc = getEncodedSourceLocation(SMLoc::getFromPointer(loc));
  mlir::emitError(errorLoc, message);
  return formToken(Token::error, loc);
}
/// Lex and return the next token, advancing `curPtr` past it.
///
/// Whitespace, `//` comments and nul bytes that are not the end-of-buffer
/// marker are skipped. A character that cannot start any token produces an
/// "unexpected character" diagnostic and a Token::error token.
Token Lexer::lexToken() {
  while (true) {
    const char *tokStart = curPtr;

    switch (*curPtr++) {
    default:
      // Handle bare identifiers. The character is converted to
      // `unsigned char` first: passing a negative `char` (any byte >= 0x80
      // where `char` is signed) to isalpha is undefined behavior.
      if (isalpha(static_cast<unsigned char>(curPtr[-1])))
        return lexBareIdentifierOrKeyword(tokStart);

      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");

    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Handle whitespace.
      continue;

    case '_':
      // Handle bare identifiers.
      return lexBareIdentifierOrKeyword(tokStart);

    case 0:
      // This may either be a nul character in the source file or may be the EOF
      // marker that llvm::MemoryBuffer guarantees will be there.
      if (curPtr - 1 == curBuffer.end())
        return formToken(Token::eof, tokStart);
      continue;

    case ':':
      return formToken(Token::colon, tokStart);
    case ',':
      return formToken(Token::comma, tokStart);
    case '.':
      return lexEllipsis(tokStart);
    case '(':
      return formToken(Token::l_paren, tokStart);
    case ')':
      return formToken(Token::r_paren, tokStart);
    case '{':
      return formToken(Token::l_brace, tokStart);
    case '}':
      return formToken(Token::r_brace, tokStart);
    case '[':
      return formToken(Token::l_square, tokStart);
    case ']':
      return formToken(Token::r_square, tokStart);
    case '<':
      return formToken(Token::less, tokStart);
    case '>':
      return formToken(Token::greater, tokStart);
    case '=':
      return formToken(Token::equal, tokStart);

    case '+':
      return formToken(Token::plus, tokStart);
    case '*':
      return formToken(Token::star, tokStart);
    case '-':
      // A '-' immediately followed by '>' forms the two-character arrow.
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return formToken(Token::minus, tokStart);

    case '?':
      return formToken(Token::question, tokStart);

    case '/':
      // Only '//' comments are recognized; a lone '/' is an error.
      if (*curPtr == '/') {
        skipComment();
        continue;
      }
      return emitError(tokStart, "unexpected character");

    case '@':
      return lexAtIdentifier(tokStart);

    case '!':
    case '^':
    case '#':
    case '%':
      return lexPrefixedIdentifier(tokStart);

    case '"':
      return lexString(tokStart);

    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      return lexNumber(tokStart);
    }
  }
}
/// Lex an '@foo' identifier. `tokStart` points at the '@', which the caller
/// has already consumed.
///
///   symbol-ref-id ::= `@` (bare-id | string-literal)
///
/// Returns a Token::at_identifier spanning from `tokStart` to the end of the
/// name, or an error token if the name is malformed.
Token Lexer::lexAtIdentifier(const char *tokStart) {
  // The <cctype> classifiers have undefined behavior for negative arguments,
  // so every character is converted to `unsigned char` before classification.
  auto isLetter = [](char c) {
    return isalpha(static_cast<unsigned char>(c)) != 0;
  };
  auto isDecimal = [](char c) {
    return isdigit(static_cast<unsigned char>(c)) != 0;
  };

  char cur = *curPtr++;

  // Try to parse a string literal, if present.
  if (cur == '"') {
    Token stringIdentifier = lexString(curPtr);
    if (stringIdentifier.is(Token::error))
      return stringIdentifier;
    // The string token itself is discarded; the result covers '@' plus the
    // whole quoted name.
    return formToken(Token::at_identifier, tokStart);
  }

  // Otherwise, these always start with a letter or underscore.
  if (!isLetter(cur) && cur != '_')
    return emitError(curPtr - 1,
                     "@ identifier expected to start with letter or '_'");

  // Consume the rest of the bare identifier: (letter|digit|[_$.])*
  while (isLetter(*curPtr) || isDecimal(*curPtr) || *curPtr == '_' ||
         *curPtr == '$' || *curPtr == '.')
    ++curPtr;
  return formToken(Token::at_identifier, tokStart);
}
/// Lex a bare identifier or keyword that starts with a letter.
///
/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
[mlir] Add a signedness semantics bit to IntegerType Thus far IntegerType has been signless: a value of IntegerType does not have a sign intrinsically and it's up to the specific operation to decide how to interpret those bits. For example, std.addi does two's complement arithmetic, and std.divis/std.diviu treats the first bit as a sign. This design choice was made some time ago when we did't have lots of dialects and dialects were more rigid. Today we have much more extensible infrastructure and different dialect may want different modelling over integer signedness. So while we can say we want signless integers in the standard dialect, we cannot dictate for others. Requiring each dialect to model the signedness semantics with another set of custom types is duplicating the functionality everywhere, considering the fundamental role integer types play. This CL extends the IntegerType with a signedness semantics bit. This gives each dialect an option to opt in signedness semantics if that's what they want and helps code sharing. The parser is modified to recognize `si[1-9][0-9]*` and `ui[1-9][0-9]*` as signed and unsigned integer types, respectively, leaving the original `i[1-9][0-9]*` to continue to mean no indication over signedness semantics. All existing dialects are not affected (yet) as this is a feature to opt in. More discussions can be found at: https://groups.google.com/a/tensorflow.org/d/msg/mlir/XmkV8HOPWpo/7O4X0Nb_AQAJ Differential Revision: https://reviews.llvm.org/D72533
2020-01-11 03:48:24 +08:00
/// integer-type ::= `[su]?i[1-9][0-9]*`
///
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
// Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
*curPtr == '$' || *curPtr == '.')
++curPtr;
// Check to see if this identifier is a keyword.
StringRef spelling(tokStart, curPtr - tokStart);
[mlir] Add a signedness semantics bit to IntegerType Thus far IntegerType has been signless: a value of IntegerType does not have a sign intrinsically and it's up to the specific operation to decide how to interpret those bits. For example, std.addi does two's complement arithmetic, and std.divis/std.diviu treats the first bit as a sign. This design choice was made some time ago when we did't have lots of dialects and dialects were more rigid. Today we have much more extensible infrastructure and different dialect may want different modelling over integer signedness. So while we can say we want signless integers in the standard dialect, we cannot dictate for others. Requiring each dialect to model the signedness semantics with another set of custom types is duplicating the functionality everywhere, considering the fundamental role integer types play. This CL extends the IntegerType with a signedness semantics bit. This gives each dialect an option to opt in signedness semantics if that's what they want and helps code sharing. The parser is modified to recognize `si[1-9][0-9]*` and `ui[1-9][0-9]*` as signed and unsigned integer types, respectively, leaving the original `i[1-9][0-9]*` to continue to mean no indication over signedness semantics. All existing dialects are not affected (yet) as this is a feature to opt in. More discussions can be found at: https://groups.google.com/a/tensorflow.org/d/msg/mlir/XmkV8HOPWpo/7O4X0Nb_AQAJ Differential Revision: https://reviews.llvm.org/D72533
2020-01-11 03:48:24 +08:00
auto isAllDigit = [](StringRef str) {
return llvm::all_of(str, [](char c) { return llvm::isDigit(c); });
};
// Check for i123, si456, ui789.
if ((spelling.size() > 1 && tokStart[0] == 'i' &&
isAllDigit(spelling.drop_front())) ||
((spelling.size() > 2 && tokStart[1] == 'i' &&
(tokStart[0] == 's' || tokStart[0] == 'u')) &&
isAllDigit(spelling.drop_front(2))))
return Token(Token::inttype, spelling);
Token::Kind kind = StringSwitch<Token::Kind>(spelling)
#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
#include "TokenKinds.def"
.Default(Token::bare_identifier);
return Token(kind, spelling);
}
/// Skip the remainder of a '//' comment. On entry `curPtr` points at the
/// second '/'. On return it is just past the terminating '\n' or '\r', or at
/// the end-of-buffer marker if the comment ran to the end of the buffer.
///
/// TODO: add a regex for comments here and to the spec.
///
void Lexer::skipComment() {
  // Advance over the second '/' in a '//' comment.
  assert(*curPtr == '/');
  ++curPtr;

  for (;;) {
    const char c = *curPtr++;

    // Newline is end of comment.
    if (c == '\n' || c == '\r')
      return;

    // A nul at the very end of the buffer ends the comment; step back so the
    // end-of-buffer marker is still the next character to be lexed.
    if (c == 0 && curPtr - 1 == curBuffer.end()) {
      --curPtr;
      return;
    }

    // Anything else, including an embedded nul, is part of the comment.
  }
}
/// Lex an ellipsis. `tokStart` points at the first '.', which the caller has
/// already consumed.
///
///   ellipsis ::= '...'
///
Token Lexer::lexEllipsis(const char *tokStart) {
  assert(curPtr[-1] == '.');

  // Two more dots must follow the one that was already consumed.
  const bool twoMoreDots =
      curPtr != curBuffer.end() && curPtr[0] == '.' && curPtr[1] == '.';
  if (!twoMoreDots)
    return emitError(curPtr, "expected three consecutive dots for an ellipsis");

  curPtr += 2;
  return formToken(Token::ellipsis, tokStart);
}
/// Lex a number literal. `tokStart` points at the first digit, which the
/// caller has already consumed.
///
///   integer-literal ::= digit+ | `0x` hex_digit+
///   float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
///
/// Returns a Token::integer or Token::floatliteral token.
Token Lexer::lexNumber(const char *tokStart) {
  // The <cctype> classifiers have undefined behavior for negative arguments,
  // so every character is converted to `unsigned char` before classification.
  auto isDecimal = [](char c) {
    return isdigit(static_cast<unsigned char>(c)) != 0;
  };
  auto isHex = [](char c) {
    return isxdigit(static_cast<unsigned char>(c)) != 0;
  };

  assert(isDecimal(curPtr[-1]));

  // Handle the hexadecimal case.
  if (curPtr[-1] == '0' && *curPtr == 'x') {
    // If we see stuff like 0xi32, this is a literal `0` followed by an
    // identifier `xi32`, stop after `0`.
    if (!isHex(curPtr[1]))
      return formToken(Token::integer, tokStart);

    // Skip the 'x' and the first hex digit, then any remaining hex digits.
    curPtr += 2;
    while (isHex(*curPtr))
      ++curPtr;

    return formToken(Token::integer, tokStart);
  }

  // Handle the normal decimal case.
  while (isDecimal(*curPtr))
    ++curPtr;

  // Without a '.', this is an integer literal.
  if (*curPtr != '.')
    return formToken(Token::integer, tokStart);
  ++curPtr;

  // Skip over [0-9]*([eE][-+]?[0-9]+)?
  while (isDecimal(*curPtr))
    ++curPtr;

  if (*curPtr == 'e' || *curPtr == 'E') {
    // The exponent is consumed only when at least one digit follows the
    // optional sign; otherwise the 'e'/'E' is left for the next token.
    if (isDecimal(curPtr[1]) ||
        ((curPtr[1] == '-' || curPtr[1] == '+') && isDecimal(curPtr[2]))) {
      // Skip the 'e'/'E' plus either the sign or the first exponent digit.
      curPtr += 2;
      while (isDecimal(*curPtr))
        ++curPtr;
    }
  }
  return formToken(Token::floatliteral, tokStart);
}
/// Lex an identifier that starts with a prefix followed by suffix-id.
/// `tokStart` points at the prefix character, which the caller has already
/// consumed.
///
///   attribute-id  ::= `#` suffix-id
///   ssa-id        ::= '%' suffix-id
///   block-id      ::= '^' suffix-id
///   type-id       ::= '!' suffix-id
///   suffix-id     ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
///   id-punct      ::= `$` | `.` | `_` | `-`
///
/// Returns a token whose kind is selected by the prefix, or an error token
/// when no valid suffix-id follows the prefix.
Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
  // The <cctype> classifiers have undefined behavior for negative arguments,
  // so every character is converted to `unsigned char` before classification.
  auto isLetter = [](char c) {
    return isalpha(static_cast<unsigned char>(c)) != 0;
  };
  auto isDecimal = [](char c) {
    return isdigit(static_cast<unsigned char>(c)) != 0;
  };

  // Select the token kind and the diagnostic text from the prefix character.
  Token::Kind kind;
  StringRef errorKind;
  switch (*tokStart) {
  case '#':
    kind = Token::hash_identifier;
    errorKind = "invalid attribute name";
    break;
  case '%':
    kind = Token::percent_identifier;
    errorKind = "invalid SSA name";
    break;
  case '^':
    kind = Token::caret_identifier;
    errorKind = "invalid block name";
    break;
  case '!':
    kind = Token::exclamation_identifier;
    errorKind = "invalid type identifier";
    break;
  default:
    llvm_unreachable("invalid caller");
  }

  // Parse suffix-id.
  if (isDecimal(*curPtr)) {
    // If suffix-id starts with a digit, the rest must be digits.
    while (isDecimal(*curPtr)) {
      ++curPtr;
    }
  } else if (isLetter(*curPtr) || isPunct(*curPtr)) {
    do {
      ++curPtr;
    } while (isLetter(*curPtr) || isDecimal(*curPtr) || isPunct(*curPtr));
  } else {
    // Nothing valid follows the prefix; the error is reported at the prefix
    // character itself.
    return emitError(curPtr - 1, errorKind);
  }

  return formToken(kind, tokStart);
}
/// Lex a string literal. `tokStart` is the start of the resulting token; the
/// opening '"' has already been consumed.
///
///   string-literal ::= '"' [^"\n\f\v\r]* '"'
///
/// NOTE(review): the grammar above excludes '\r', but the code below only
/// rejects '\n', '\v', '\f' and end of file -- confirm which is intended.
///
/// TODO: define escaping rules.
Token Lexer::lexString(const char *tokStart) {
  assert(curPtr[-1] == '"');

  for (;;) {
    const char c = *curPtr++;

    // The closing quote completes the token.
    if (c == '"')
      return formToken(Token::string, tokStart);

    // If this is a random nul character in the middle of a string, just
    // include it. If it is the end of file, then it is an error.
    if (c == 0 && curPtr - 1 != curBuffer.end())
      continue;
    if (c == 0 || c == '\n' || c == '\v' || c == '\f')
      return emitError(curPtr - 1, "expected '\"' in string literal");

    // Everything except a backslash is ordinary string content.
    if (c != '\\')
      continue;

    // Handle explicitly a few escapes.
    const bool simpleEscape =
        *curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't';
    if (simpleEscape) {
      ++curPtr;
      continue;
    }

    // Support \xx for two hex digits.
    if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1])) {
      curPtr += 2;
      continue;
    }

    return emitError(curPtr - 1, "unknown escape in string literal");
  }
}