MIR Serialization: Introduce a lexer for machine instructions.

This commit adds a function that tokenizes the string containing
the machine instruction. This commit also adds a struct called 
'MIToken' which is used to represent the lexer's tokens.

Reviewers: Sean Silva

Differential Revision: http://reviews.llvm.org/D10521

llvm-svn: 240323
This commit is contained in:
Alex Lorenz 2015-06-22 20:37:46 +00:00
parent f22855079a
commit 91370c5d62
7 changed files with 230 additions and 12 deletions

View File

@ -1,4 +1,5 @@
add_llvm_library(LLVMMIRParser
MILexer.cpp
MIParser.cpp
MIRParser.cpp
)

View File

@ -0,0 +1,87 @@
//===- MILexer.cpp - Machine instructions lexer implementation ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the lexing of machine instructions.
//
//===----------------------------------------------------------------------===//
#include "MILexer.h"
#include "llvm/ADT/Twine.h"
#include <cctype>
using namespace llvm;
namespace {
/// A lightweight forward cursor over the characters of a source string.
///
/// The cursor only stores a pair of pointers into the string it was created
/// from, so copies are cheap and the underlying characters must outlive it.
class Cursor {
  const char *Ptr;
  const char *End;

public:
  /// Position the cursor at the first character of \p Str.
  explicit Cursor(StringRef Str)
      : Ptr(Str.data()), End(Str.data() + Str.size()) {}

  /// Return true once every character has been consumed.
  bool isEOF() const { return Ptr == End; }

  /// Return the current character, or 0 when the cursor is at the end.
  char peek() const { return isEOF() ? 0 : *Ptr; }

  /// Move to the next character. The cursor must not already be at the end;
  /// stepping past End would make every later range computation invalid.
  void advance() {
    assert(!isEOF() && "cannot advance past the end of the source");
    ++Ptr;
  }

  /// Return the characters that have not been consumed yet.
  StringRef remaining() const { return StringRef(Ptr, End - Ptr); }

  /// Return the characters between this cursor and the later cursor \p C.
  StringRef upto(Cursor C) const {
    assert(C.Ptr >= Ptr && C.Ptr <= End);
    return StringRef(Ptr, C.Ptr - Ptr);
  }

  /// Return the current position as an iterator into the source string.
  StringRef::iterator location() const { return Ptr; }
};
} // end anonymous namespace
/// Skip the leading whitespace characters and return the updated cursor.
static Cursor skipWhitespace(Cursor C) {
  // The <cctype> classifiers have undefined behaviour for negative arguments
  // other than EOF, so convert through unsigned char before classifying.
  while (std::isspace(static_cast<unsigned char>(C.peek())))
    C.advance();
  return C;
}
/// Return true if \p C may appear inside an identifier: a letter, a digit,
/// '_', '-' or '.'.
static bool isIdentifierChar(char C) {
  // The <cctype> classifiers have undefined behaviour for negative arguments
  // other than EOF, so convert through unsigned char before classifying.
  const unsigned char UC = static_cast<unsigned char>(C);
  return std::isalpha(UC) || std::isdigit(UC) || C == '_' || C == '-' ||
         C == '.';
}
/// Lex an identifier that begins at \p C and store it in \p Token.
///
/// Consumes characters for as long as they are identifier characters and
/// returns the cursor positioned just after the identifier.
static Cursor lexIdentifier(Cursor C, MIToken &Token) {
  const Cursor Start = C;
  for (; isIdentifierChar(C.peek()); C.advance())
    ;
  Token = MIToken(MIToken::Identifier, Start.upto(C));
  return C;
}
/// Lex a single machine instruction token from \p Source.
///
/// Leading whitespace is skipped first. At the end of the input \p Token
/// becomes an Eof token; a letter or '_' starts an Identifier token; any other
/// character produces an Error token and is reported through
/// \p ErrorCallback together with its location.
///
/// \returns the part of \p Source that has not been consumed. For an Error
/// token the offending character is not consumed.
StringRef llvm::lexMIToken(
    StringRef Source, MIToken &Token,
    function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
  const Cursor C = skipWhitespace(Cursor(Source));
  if (C.isEOF()) {
    Token = MIToken(MIToken::Eof, C.remaining());
    return C.remaining();
  }

  const char Char = C.peek();
  // The <cctype> classifiers have undefined behaviour for negative arguments
  // other than EOF, so convert through unsigned char before classifying.
  if (std::isalpha(static_cast<unsigned char>(Char)) || Char == '_')
    return lexIdentifier(C, Token).remaining();

  Token = MIToken(MIToken::Error, C.remaining());
  ErrorCallback(C.location(),
                Twine("unexpected character '") + Twine(Char) + "'");
  return C.remaining();
}

View File

@ -0,0 +1,65 @@
//===- MILexer.h - Lexer for machine instructions -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the function that lexes the machine instruction source
// string.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/STLExtras.h"
#include <functional>
namespace llvm {
class Twine;
/// A single token produced by the machine instruction lexer: a kind plus the
/// range of source characters the token was created with.
struct MIToken {
  /// The different kinds of tokens the lexer can produce.
  enum TokenKind {
    // Markers
    Eof,
    Error,

    // Identifier tokens
    Identifier
  };

private:
  TokenKind Kind;
  StringRef Range;

public:
  MIToken(TokenKind Kind, StringRef Range) : Kind(Kind), Range(Range) {}

  /// Return the kind of this token.
  TokenKind kind() const { return Kind; }

  /// Return true if this token has the kind \p K.
  bool is(TokenKind K) const { return Kind == K; }

  /// Return true if this token does not have the kind \p K.
  bool isNot(TokenKind K) const { return !is(K); }

  /// Return true if this is an Error token.
  bool isError() const { return is(Error); }

  /// Return the position where the token's source range begins.
  StringRef::iterator location() const { return Range.begin(); }

  /// Return the source range of the token.
  StringRef stringValue() const { return Range; }
};
/// Consume a single machine instruction token in the given source and return
/// the remaining source string.
///
/// Leading whitespace in \p Source is skipped before the token is lexed. The
/// lexed token is stored in \p Token: an Eof token when only whitespace is
/// left, an Identifier token when the next character is a letter or '_', and
/// an Error token otherwise.
///
/// \p ErrorCallback is invoked with the location of the offending character
/// and a message when an unexpected character is encountered.
StringRef lexMIToken(
StringRef Source, MIToken &Token,
function_ref<void(StringRef::iterator, const Twine &)> ErrorCallback);
} // end namespace llvm
#endif

View File

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MIParser.h"
#include "MILexer.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@ -29,7 +30,8 @@ class MIParser {
SourceMgr &SM;
MachineFunction &MF;
SMDiagnostic &Error;
StringRef Source;
StringRef Source, CurrentSource;
MIToken Token;
/// Maps from instruction names to op codes.
StringMap<unsigned> Names2InstrOpCodes;
@ -37,11 +39,18 @@ public:
MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
StringRef Source);
void lex();
/// Report an error at the current location with the given message.
///
/// This function always returns true.
bool error(const Twine &Msg);
/// Report an error at the given location with the given message.
///
/// This function always returns true.
bool error(StringRef::iterator Loc, const Twine &Msg);
MachineInstr *parse();
private:
@ -50,31 +59,42 @@ private:
/// Try to convert an instruction name to an opcode. Return true if the
/// instruction name is invalid.
bool parseInstrName(StringRef InstrName, unsigned &OpCode);
bool parseInstruction(unsigned &OpCode);
};
} // end anonymous namespace
/// Create a parser for the machine instruction text in \p Source.
///
/// Both Source and CurrentSource start out as the whole string, and Token
/// starts out as an Error token with an empty range.
MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
StringRef Source)
: SM(SM), MF(MF), Error(Error), Source(Source) {}
: SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source),
Token(MIToken::Error, StringRef()) {}
bool MIParser::error(const Twine &Msg) {
void MIParser::lex() {
CurrentSource = lexMIToken(
CurrentSource, Token,
[this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); });
}
bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); }
/// Report an error with the message \p Msg at the location \p Loc.
///
/// The diagnostic is stored in Error, using the identifier of the main file's
/// memory buffer as the file name and line number 1. Always returns true.
bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) {
// TODO: Get the proper location in the MIR file, not just a location inside
// the string.
Error =
SMDiagnostic(SM, SMLoc(), SM.getMemoryBuffer(SM.getMainFileID())
->getBufferIdentifier(),
1, 0, SourceMgr::DK_Error, Msg.str(), Source, None, None);
// Loc must point into Source (or one past its end) so that the column
// computed below is a valid offset from the start of the string.
assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
Error = SMDiagnostic(
SM, SMLoc(),
SM.getMemoryBuffer(SM.getMainFileID())->getBufferIdentifier(), 1,
Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), Source, None, None);
return true;
}
MachineInstr *MIParser::parse() {
StringRef InstrName = Source;
lex();
unsigned OpCode;
if (parseInstrName(InstrName, OpCode)) {
error(Twine("unknown machine instruction name '") + InstrName + "'");
if (Token.isError() || parseInstruction(OpCode))
return nullptr;
}
// TODO: Parse the rest of instruction - machine operands, etc.
const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode);
@ -82,6 +102,15 @@ MachineInstr *MIParser::parse() {
return MI;
}
bool MIParser::parseInstruction(unsigned &OpCode) {
if (Token.isNot(MIToken::Identifier))
return error("expected a machine instruction");
StringRef InstrName = Token.stringValue();
if (parseInstrName(InstrName, OpCode))
return error(Twine("unknown machine instruction name '") + InstrName + "'");
return false;
}
void MIParser::initNames2InstrOpCodes() {
if (!Names2InstrOpCodes.empty())
return;

View File

@ -20,5 +20,5 @@ body:
# CHECK: - IMUL32rri8
# CHECK-NEXT: - RETQ
- IMUL32rri8
- RETQ
- ' RETQ '
...

View File

@ -0,0 +1,18 @@
# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
--- |
define void @foo() {
entry:
ret void
}
...
---
name: foo
body:
- name: entry
instructions:
# CHECK: 1:1: expected a machine instruction
- ''
...

View File

@ -0,0 +1,18 @@
# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
--- |
define void @foo() {
entry:
ret void
}
...
---
name: foo
body:
- name: entry
instructions:
# CHECK: 1:1: unexpected character '`'
- '` RETQ'
...