Add .rc scripts tokenizer.
This extends the llvm-rc tool shell with the ability to tokenize input files. Currently, ASCII and ASCII-compatible UTF-8 files are supported.

Thanks to Nico Weber (thakis) for his original work in this area.

Differential Revision: https://reviews.llvm.org/D35957

llvm-svn: 310621
commit 719e22d4f4, parent 27fbd1e14a
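To show how the pieces fit together, here is a minimal sketch (not part of the commit) of driving the tokenizer from C++; llvm-rc.cpp below wires the same calls into the tool's /V (verbose) mode. The helper name dumpTokens is hypothetical.

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <vector>

// Hypothetical helper: tokenize Contents and print one line per token.
static void dumpTokens(llvm::StringRef Contents) {
  llvm::Expected<std::vector<llvm::RCToken>> Tokens = llvm::tokenizeRC(Contents);
  if (!Tokens) {
    // Tokenization problems (e.g. an unterminated string) surface as llvm::Error.
    llvm::errs() << llvm::toString(Tokens.takeError()) << "\n";
    return;
  }
  for (const llvm::RCToken &Token : *Tokens) {
    llvm::outs() << Token.value();
    if (Token.kind() == llvm::RCToken::Kind::Int)
      llvm::outs() << " (int value: " << Token.intValue() << ")";
    llvm::outs() << "\n";
  }
}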
@@ -0,0 +1,8 @@
1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
He11o LLVM

"RC string test.",L"Another RC string test.'&{",42,100



":))"
@@ -0,0 +1,35 @@
; RUN: llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s

; CHECK: Int: 1; int value = 1
; CHECK-NEXT: Plus: +
; CHECK-NEXT: Int: 2; int value = 2
; CHECK-NEXT: Minus: -
; CHECK-NEXT: Int: 3214L; int value = 3214
; CHECK-NEXT: Amp: &
; CHECK-NEXT: Int: 0x120894; int value = 1181844
; CHECK-NEXT: Int: 032173; int value = 13435
; CHECK-NEXT: Int: 2; int value = 2
; CHECK-NEXT: Pipe: |
; CHECK-NEXT: Amp: &
; CHECK-NEXT: Tilde: ~
; CHECK-NEXT: Plus: +
; CHECK-NEXT: LeftParen: (
; CHECK-NEXT: Minus: -
; CHECK-NEXT: Int: 7; int value = 7
; CHECK-NEXT: RightParen: )
; CHECK-NEXT: BlockBegin: {
; CHECK-NEXT: Int: 0xabcdef; int value = 11259375
; CHECK-NEXT: Int: 0xABCDEFl; int value = 11259375
; CHECK-NEXT: BlockEnd: }
; CHECK-NEXT: BlockBegin: Begin
; CHECK-NEXT: BlockEnd: End
; CHECK-NEXT: Identifier: He11o
; CHECK-NEXT: Identifier: LLVM
; CHECK-NEXT: String: "RC string test."
; CHECK-NEXT: Comma: ,
; CHECK-NEXT: String: L"Another RC string test.'&{"
; CHECK-NEXT: Comma: ,
; CHECK-NEXT: Int: 42; int value = 42
; CHECK-NEXT: Comma: ,
; CHECK-NEXT: Int: 100; int value = 100
; CHECK-NEXT: String: ":))"
@@ -10,4 +10,5 @@ add_public_tablegen_target(RcTableGen)

add_llvm_tool(llvm-rc
  llvm-rc.cpp
  ResourceScriptToken.cpp
  )
@@ -0,0 +1,296 @@
//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef::getAsInteger's). If Representation is correct, 'true' is
// returned and the converted value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the last 'L' if unnecessary.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
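
// Illustrative conversions, taken from the values exercised by the tokenizer
// test in this commit:
//   "42"       -> 42        (decimal)
//   "032173"   -> 13435     (octal, leading 0)
//   "0x120894" -> 1181844   (hexadecimal)
//   "3214L"    -> 3214      (the trailing 'L' is stripped before conversion)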

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;
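  // For example, "He11o" and "LLVM" from the tokenizer test input are valid
  // identifiers, while anything starting with a digit is lexed as an Int.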

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}
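
// Note (illustrative, cf. the test input): string literals may carry an 'L' or
// 'l' prefix marking a wide string, e.g. L"Another RC string test.'&{", and
// the prefix is kept as part of the token's value.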

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) \
  case Ch: \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}
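
// Illustration (from the tokenizer test above): the identifiers "Begin" and
// "End" are remapped here to BlockBegin and BlockEnd, making them equivalent
// to the '{' and '}' tokens.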

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
@@ -0,0 +1,81 @@
//===-- ResourceScriptToken.h -----------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This declares the .rc script tokens and defines an interface for tokenizing
// the input data. The list of available tokens is located at
// ResourceScriptTokenList.h.
//
// Note that the tokenizer does not support comments or preprocessor
// directives. The preprocessor should do its work on the .rc file before
// running llvm-rc.
//
// For now, it is possible to parse ASCII files only (the behavior on other
// UTF encodings might be undefined). However, the tokenizer already consumes
// a UTF-8 BOM, if there is any; thus, ASCII-compatible UTF-8 files are
// tokenized correctly.
//
// Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380599(v=vs.85).aspx
//
//===---------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
#define LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

#include <cstdint>
#include <map>
#include <string>
#include <vector>

namespace llvm {

// A definition of a single resource script token. Each token has its kind
// (declared in ResourceScriptTokenList) and holds a value - a reference
// representation of the token.
// RCToken does not claim ownership of its value. The memory buffer containing
// the token value should be stored in a safe place and must not be freed
// or reallocated while the token is in use.
class RCToken {
public:
  enum class Kind {
#define TOKEN(Name) Name,
#define SHORT_TOKEN(Name, Ch) Name,
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
  };

  RCToken(RCToken::Kind RCTokenKind, StringRef Value);

  // Get an integer value of the integer token.
  uint32_t intValue() const;

  StringRef value() const;
  Kind kind() const;

private:
  Kind TokenKind;
  StringRef TokenValue;
};
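
// A minimal illustration (hypothetical snippet, mirroring the test input):
//   RCToken Tok(RCToken::Kind::Int, "42");
//   Tok.kind();     // RCToken::Kind::Int
//   Tok.value();    // "42"
//   Tok.intValue(); // 42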

// Tokenize Input.
// If no error occurred, the return value contains the tokens in the order
// they appeared in the input file.
// In case of any error, the return value contains a textual representation
// of the error.
//
// Tokens returned by this function hold only references to the parts
// of the Input. The memory buffer containing Input must not be freed,
// modified or reallocated.
Expected<std::vector<RCToken>> tokenizeRC(StringRef Input);

} // namespace llvm

#endif
@@ -0,0 +1,35 @@
//===-- ResourceScriptTokenList.h -------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This is a part of llvm-rc tokenizer. It lists all the possible tokens
// that might occur in a correct .rc script.
//
//===---------------------------------------------------------------------===//


// Long tokens. They might consist of more than one character.
TOKEN(Invalid)    // Invalid token. Should not occur in a valid script.
TOKEN(Int)        // Integer (decimal, octal or hexadecimal).
TOKEN(String)     // String value.
TOKEN(Identifier) // Script identifier (resource name or type).

// Short tokens. They usually consist of exactly one character.
// The definitions are of the form SHORT_TOKEN(TokenName, TokenChar).
// TokenChar is the one-character token representation occurring in correct
// .rc scripts.
SHORT_TOKEN(BlockBegin, '{') // Start of the script block; can also be BEGIN.
SHORT_TOKEN(BlockEnd, '}')   // End of the block; can also be END.
SHORT_TOKEN(Comma, ',')      // Comma - resource arguments separator.
SHORT_TOKEN(Plus, '+')       // Addition operator.
SHORT_TOKEN(Minus, '-')      // Subtraction operator.
SHORT_TOKEN(Pipe, '|')       // Bitwise-OR operator.
SHORT_TOKEN(Amp, '&')        // Bitwise-AND operator.
SHORT_TOKEN(Tilde, '~')      // Bitwise-NOT operator.
SHORT_TOKEN(LeftParen, '(')  // Left parenthesis in the script expressions.
SHORT_TOKEN(RightParen, ')') // Right parenthesis.
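
// For example (cf. the tokenizer test), the input "1 + 2 {" lexes as:
//   Int(1), Plus, Int(2), BlockBegin.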
@@ -1,4 +1,4 @@
//===- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*--===//
//===-- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
@@ -12,6 +12,8 @@
//
//===----------------------------------------------------------------------===//

#include "ResourceScriptToken.h"

#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/Error.h"
@@ -60,6 +62,12 @@ public:
};

static ExitOnError ExitOnErr;

LLVM_ATTRIBUTE_NORETURN static void fatalError(Twine Message) {
  errs() << Message << "\n";
  exit(1);
}

} // anonymous namespace

int main(int argc_, const char *argv_[]) {
@@ -81,8 +89,49 @@ int main(int argc_, const char *argv_[]) {
  opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);

  // The tool prints nothing when invoked with no command-line arguments.
  if (InputArgs.hasArg(OPT_HELP))
  if (InputArgs.hasArg(OPT_HELP)) {
    T.PrintHelp(outs(), "rc", "Resource Converter", false);
    return 0;
  }

  const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE);

  std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);
  if (InArgsInfo.size() != 1) {
    fatalError("Exactly one input file should be provided.");
  }

  // Read and tokenize the input file.
  const Twine &Filename = InArgsInfo[0];
  ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);
  if (!File) {
    fatalError("Error opening file '" + Filename +
               "': " + File.getError().message());
  }

  std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);
  StringRef Contents = FileContents->getBuffer();

  std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));

  if (BeVerbose) {
    const Twine TokenNames[] = {
#define TOKEN(Name) #Name,
#define SHORT_TOKEN(Name, Ch) #Name,
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    };

    for (const RCToken &Token : Tokens) {
      outs() << TokenNames[static_cast<int>(Token.kind())] << ": "
             << Token.value();
      if (Token.kind() == RCToken::Kind::Int)
        outs() << "; int value = " << Token.intValue();

      outs() << "\n";
    }
  }
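
  // For the tokenizer test input above, the first line printed by the verbose
  // dump is "Int: 1; int value = 1" (cf. the FileCheck expectations in the test).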

  return 0;
}