From 719e22d4f49c37dbfb4bd2fbcbea03c33ad1200a Mon Sep 17 00:00:00 2001 From: Marek Sokolowski Date: Thu, 10 Aug 2017 16:21:44 +0000 Subject: [PATCH] Add .rc scripts tokenizer. This extends the shell of llvm-rc tool with the ability of tokenization of the input files. Currently, ASCII and ASCII-compatible UTF-8 files are supported. Thanks to Nico Weber (thakis) for his original work in this area. Differential Revision: https://reviews.llvm.org/D35957 llvm-svn: 310621 --- llvm/test/tools/llvm-rc/Inputs/tokens.rc | 8 + llvm/test/tools/llvm-rc/tokenizer.test | 35 +++ llvm/tools/llvm-rc/CMakeLists.txt | 1 + llvm/tools/llvm-rc/ResourceScriptToken.cpp | 296 +++++++++++++++++++ llvm/tools/llvm-rc/ResourceScriptToken.h | 81 +++++ llvm/tools/llvm-rc/ResourceScriptTokenList.h | 35 +++ llvm/tools/llvm-rc/llvm-rc.cpp | 53 +++- 7 files changed, 507 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-rc/Inputs/tokens.rc create mode 100644 llvm/test/tools/llvm-rc/tokenizer.test create mode 100644 llvm/tools/llvm-rc/ResourceScriptToken.cpp create mode 100644 llvm/tools/llvm-rc/ResourceScriptToken.h create mode 100644 llvm/tools/llvm-rc/ResourceScriptTokenList.h diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc new file mode 100644 index 000000000000..20619149bb02 --- /dev/null +++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc @@ -0,0 +1,8 @@ +1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End +He11o LLVM + +"RC string test.",L"Another RC string test.'&{",42,100 + + + + ":))" diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test new file mode 100644 index 000000000000..789b313efadb --- /dev/null +++ b/llvm/test/tools/llvm-rc/tokenizer.test @@ -0,0 +1,35 @@ +; RUN: llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s + +; CHECK: Int: 1; int value = 1 +; CHECK-NEXT: Plus: + +; CHECK-NEXT: Int: 2; int value = 2 +; CHECK-NEXT: Minus: - +; CHECK-NEXT: Int: 3214L; int value = 
3214 +; CHECK-NEXT: Amp: & +; CHECK-NEXT: Int: 0x120894; int value = 1181844 +; CHECK-NEXT: Int: 032173; int value = 13435 +; CHECK-NEXT: Int: 2; int value = 2 +; CHECK-NEXT: Pipe: | +; CHECK-NEXT: Amp: & +; CHECK-NEXT: Tilde: ~ +; CHECK-NEXT: Plus: + +; CHECK-NEXT: LeftParen: ( +; CHECK-NEXT: Minus: - +; CHECK-NEXT: Int: 7; int value = 7 +; CHECK-NEXT: RightParen: ) +; CHECK-NEXT: BlockBegin: { +; CHECK-NEXT: Int: 0xabcdef; int value = 11259375 +; CHECK-NEXT: Int: 0xABCDEFl; int value = 11259375 +; CHECK-NEXT: BlockEnd: } +; CHECK-NEXT: BlockBegin: Begin +; CHECK-NEXT: BlockEnd: End +; CHECK-NEXT: Identifier: He11o +; CHECK-NEXT: Identifier: LLVM +; CHECK-NEXT: String: "RC string test." +; CHECK-NEXT: Comma: , +; CHECK-NEXT: String: L"Another RC string test.'&{" +; CHECK-NEXT: Comma: , +; CHECK-NEXT: Int: 42; int value = 42 +; CHECK-NEXT: Comma: , +; CHECK-NEXT: Int: 100; int value = 100 +; CHECK-NEXT: String: ":))" diff --git a/llvm/tools/llvm-rc/CMakeLists.txt b/llvm/tools/llvm-rc/CMakeLists.txt index 40f88bcf3182..0e254516d740 100644 --- a/llvm/tools/llvm-rc/CMakeLists.txt +++ b/llvm/tools/llvm-rc/CMakeLists.txt @@ -10,4 +10,5 @@ add_public_tablegen_target(RcTableGen) add_llvm_tool(llvm-rc llvm-rc.cpp + ResourceScriptToken.cpp ) diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.cpp b/llvm/tools/llvm-rc/ResourceScriptToken.cpp new file mode 100644 index 000000000000..95ff113c70cc --- /dev/null +++ b/llvm/tools/llvm-rc/ResourceScriptToken.cpp @@ -0,0 +1,296 @@ +//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This file implements an interface defined in ResourceScriptToken.h. +// In particular, it defines an .rc script tokenizer. 
+// +//===---------------------------------------------------------------------===// + +#include "ResourceScriptToken.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include +#include + +using namespace llvm; + +using Kind = RCToken::Kind; + +// Checks if Representation is a correct description of an RC integer. +// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), +// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' +// character (that is the difference between our representation and +// StringRef's one). If Representation is correct, 'true' is returned and +// the return value is put back in Num. +static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { + size_t Length = Representation.size(); + if (Length == 0) + return false; + // Strip the last 'L' if unnecessary. + if (std::toupper(Representation.back()) == 'L') + Representation = Representation.drop_back(1); + + return !Representation.getAsInteger(0, Num); +} + +RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value) + : TokenKind(RCTokenKind), TokenValue(Value) {} + +uint32_t RCToken::intValue() const { + assert(TokenKind == Kind::Int); + // We assume that the token already is a correct integer (checked by + // rcGetAsInteger). + uint32_t Result; + bool IsSuccess = rcGetAsInteger(TokenValue, Result); + assert(IsSuccess); + (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. 
+ return Result; +} + +StringRef RCToken::value() const { return TokenValue; } + +Kind RCToken::kind() const { return TokenKind; } + +static Error getStringError(const Twine &message) { + return make_error("Error parsing file: " + message, + inconvertibleErrorCode()); +} + +namespace { + +class Tokenizer { +public: + Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {} + + Expected> run(); + +private: + // All 'advancing' methods return boolean values; if they're equal to false, + // the stream has ended or failed. + bool advance(size_t Amount = 1); + bool skipWhitespaces(); + + // Consumes a token. If any problem occurred, a non-empty Error is returned. + Error consumeToken(const Kind TokenKind); + + // Check if tokenizer is about to read FollowingChars. + bool willNowRead(StringRef FollowingChars) const; + + // Check if tokenizer can start reading an identifier at current position. + // The original tool did not specify the rules to determine what is a correct + // identifier. We assume they should follow the C convention: + // [a-zA-Z_][a-zA-Z0-9_]*. + bool canStartIdentifier() const; + // Check if tokenizer can continue reading an identifier. + bool canContinueIdentifier() const; + + // Check if tokenizer can start reading an integer. + // A correct integer always starts with a 0-9 digit, + // can contain characters 0-9A-Fa-f (digits), + // Ll (marking the integer is 32-bit), Xx (marking the representation + // is hexadecimal). As some kind of separator should come after the + // integer, we can consume the integer until a non-alphanumeric + // character. + bool canStartInt() const; + bool canContinueInt() const; + + bool canStartString() const; + + bool streamEof() const; + + // Classify the token that is about to be read from the current position. + Kind classifyCurrentToken() const; + + // Process the Kind::Identifier token - check if it is + // an identifier describing a block start or end. 
+ void processIdentifier(RCToken &token) const; + + StringRef Data; + size_t DataLength, Pos; +}; + +Expected> Tokenizer::run() { + Pos = 0; + std::vector Result; + + // Consume an optional UTF-8 Byte Order Mark. + if (willNowRead("\xef\xbb\xbf")) + advance(3); + + while (!streamEof()) { + if (!skipWhitespaces()) + break; + + Kind TokenKind = classifyCurrentToken(); + if (TokenKind == Kind::Invalid) + return getStringError("Invalid token found at position " + Twine(Pos)); + + const size_t TokenStart = Pos; + if (Error TokenError = consumeToken(TokenKind)) + return std::move(TokenError); + + RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart)); + if (TokenKind == Kind::Identifier) { + processIdentifier(Token); + } else if (TokenKind == Kind::Int) { + uint32_t TokenInt; + if (!rcGetAsInteger(Token.value(), TokenInt)) { + // The integer has incorrect format or cannot be represented in + // a 32-bit integer. + return getStringError("Integer invalid or too large: " + + Token.value().str()); + } + } + + Result.push_back(Token); + } + + return Result; +} + +bool Tokenizer::advance(size_t Amount) { + Pos += Amount; + return !streamEof(); +} + +bool Tokenizer::skipWhitespaces() { + while (!streamEof() && std::isspace(Data[Pos])) + advance(); + return !streamEof(); +} + +Error Tokenizer::consumeToken(const Kind TokenKind) { + switch (TokenKind) { + // One-character token consumption. +#define TOKEN(Name) +#define SHORT_TOKEN(Name, Ch) case Kind::Name: +#include "ResourceScriptTokenList.h" +#undef TOKEN +#undef SHORT_TOKEN + advance(); + return Error::success(); + + case Kind::Identifier: + while (!streamEof() && canContinueIdentifier()) + advance(); + return Error::success(); + + case Kind::Int: + while (!streamEof() && canContinueInt()) + advance(); + return Error::success(); + + case Kind::String: + // Consume the preceding 'L', if there is any. + if (std::toupper(Data[Pos]) == 'L') + advance(); + // Consume the double-quote. 
+ advance(); + + // Consume the characters until the end of the file, line or string. + while (true) { + if (streamEof()) { + return getStringError("Unterminated string literal."); + } else if (Data[Pos] == '"') { + // Consume the ending double-quote. + advance(); + return Error::success(); + } else if (Data[Pos] == '\n') { + return getStringError("String literal not terminated in the line."); + } + + advance(); + } + + case Kind::Invalid: + assert(false && "Cannot consume an invalid token."); + } +} + +bool Tokenizer::willNowRead(StringRef FollowingChars) const { + return Data.drop_front(Pos).startswith(FollowingChars); +} + +bool Tokenizer::canStartIdentifier() const { + assert(!streamEof()); + + const char CurChar = Data[Pos]; + return std::isalpha(CurChar) || CurChar == '_'; +} + +bool Tokenizer::canContinueIdentifier() const { + assert(!streamEof()); + const char CurChar = Data[Pos]; + return std::isalnum(CurChar) || CurChar == '_'; +} + +bool Tokenizer::canStartInt() const { + assert(!streamEof()); + return std::isdigit(Data[Pos]); +} + +bool Tokenizer::canContinueInt() const { + assert(!streamEof()); + return std::isalnum(Data[Pos]); +} + +bool Tokenizer::canStartString() const { + return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\""); +} + +bool Tokenizer::streamEof() const { return Pos == DataLength; } + +Kind Tokenizer::classifyCurrentToken() const { + if (canStartInt()) + return Kind::Int; + if (canStartString()) + return Kind::String; + // BEGIN and END are at this point of lexing recognized as identifiers. + if (canStartIdentifier()) + return Kind::Identifier; + + const char CurChar = Data[Pos]; + + switch (CurChar) { + // One-character token classification. 
+#define TOKEN(Name) +#define SHORT_TOKEN(Name, Ch) \ + case Ch: \ + return Kind::Name; +#include "ResourceScriptTokenList.h" +#undef TOKEN +#undef SHORT_TOKEN + + default: + return Kind::Invalid; + } +} + +void Tokenizer::processIdentifier(RCToken &Token) const { + assert(Token.kind() == Kind::Identifier); + StringRef Name = Token.value(); + + if (Name.equals_lower("begin")) + Token = RCToken(Kind::BlockBegin, Name); + else if (Name.equals_lower("end")) + Token = RCToken(Kind::BlockEnd, Name); +} + +} // anonymous namespace + +namespace llvm { + +Expected> tokenizeRC(StringRef Input) { + return Tokenizer(Input).run(); +} + +} // namespace llvm diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.h b/llvm/tools/llvm-rc/ResourceScriptToken.h new file mode 100644 index 000000000000..268f37a9d000 --- /dev/null +++ b/llvm/tools/llvm-rc/ResourceScriptToken.h @@ -0,0 +1,81 @@ +//===-- ResourceScriptToken.h -----------------------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This declares the .rc script tokens and defines an interface for tokenizing +// the input data. The list of available tokens is located at +// ResourceScriptTokenList.h. +// +// Note that the tokenizer does not support comments or preprocessor +// directives. The preprocessor should do its work on the .rc file before +// running llvm-rc. +// +// As for now, it is possible to parse ASCII files only (the behavior on +// UTF files might be undefined). However, it already consumes UTF-8 BOM, if +// there is any. Thus, ASCII-compatible UTF-8 files are tokenized correctly. 
+// +// Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380599(v=vs.85).aspx +// +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H +#define LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +#include +#include +#include +#include + +namespace llvm { + +// A definition of a single resource script token. Each token has its kind +// (declared in ResourceScriptTokenList) and holds a value - a reference +// representation of the token. +// RCToken does not claim ownership on its value. A memory buffer containing +// the token value should be stored in a safe place and cannot be freed +// nor reallocated. +class RCToken { +public: + enum class Kind { +#define TOKEN(Name) Name, +#define SHORT_TOKEN(Name, Ch) Name, +#include "ResourceScriptTokenList.h" +#undef TOKEN +#undef SHORT_TOKEN + }; + + RCToken(RCToken::Kind RCTokenKind, StringRef Value); + + // Get an integer value of the integer token. + uint32_t intValue() const; + + StringRef value() const; + Kind kind() const; + +private: + Kind TokenKind; + StringRef TokenValue; +}; + +// Tokenize Input. +// In case no error occurred, the return value contains +// tokens in order they were in the input file. +// In case of any error, the return value contains +// a textual representation of error. +// +// Tokens returned by this function hold only references to the parts +// of the Input. Memory buffer containing Input cannot be freed, +// modified or reallocated. 
+Expected> tokenizeRC(StringRef Input); + +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-rc/ResourceScriptTokenList.h b/llvm/tools/llvm-rc/ResourceScriptTokenList.h new file mode 100644 index 000000000000..f8d7303e7a8a --- /dev/null +++ b/llvm/tools/llvm-rc/ResourceScriptTokenList.h @@ -0,0 +1,35 @@ +//===-- ResourceScriptTokenList.h -------------------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This is a part of llvm-rc tokenizer. It lists all the possible tokens +// that might occur in a correct .rc script. +// +//===---------------------------------------------------------------------===// + + +// Long tokens. They might consist of more than one character. +TOKEN(Invalid) // Invalid token. Should not occur in a valid script. +TOKEN(Int) // Integer (decimal, octal or hexadecimal). +TOKEN(String) // String value. +TOKEN(Identifier) // Script identifier (resource name or type). + +// Short tokens. They usually consist of exactly one character. +// The definitions are of the form SHORT_TOKEN(TokenName, TokenChar). +// TokenChar is the one-character token representation occurring in the correct +// .rc scripts. +SHORT_TOKEN(BlockBegin, '{') // Start of the script block; can also be BEGIN. +SHORT_TOKEN(BlockEnd, '}') // End of the block; can also be END. +SHORT_TOKEN(Comma, ',') // Comma - resource arguments separator. +SHORT_TOKEN(Plus, '+') // Addition operator. +SHORT_TOKEN(Minus, '-') // Subtraction operator. +SHORT_TOKEN(Pipe, '|') // Bitwise-OR operator. +SHORT_TOKEN(Amp, '&') // Bitwise-AND operator. +SHORT_TOKEN(Tilde, '~') // Bitwise-NOT operator. +SHORT_TOKEN(LeftParen, '(') // Left parenthesis in the script expressions. +SHORT_TOKEN(RightParen, ')') // Right parenthesis. 
diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp index aa97bf9c0d5b..098daba14e21 100644 --- a/llvm/tools/llvm-rc/llvm-rc.cpp +++ b/llvm/tools/llvm-rc/llvm-rc.cpp @@ -1,4 +1,4 @@ -//===- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*--===// +//===-- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,6 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "ResourceScriptToken.h" + #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" @@ -60,6 +62,12 @@ public: }; static ExitOnError ExitOnErr; + +LLVM_ATTRIBUTE_NORETURN static void fatalError(Twine Message) { + errs() << Message << "\n"; + exit(1); +} + } // anonymous namespace int main(int argc_, const char *argv_[]) { @@ -81,8 +89,49 @@ int main(int argc_, const char *argv_[]) { opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC); // The tool prints nothing when invoked with no command-line arguments. - if (InputArgs.hasArg(OPT_HELP)) + if (InputArgs.hasArg(OPT_HELP)) { T.PrintHelp(outs(), "rc", "Resource Converter", false); + return 0; + } + + const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE); + + std::vector InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT); + if (InArgsInfo.size() != 1) { + fatalError("Exactly one input file should be provided."); + } + + // Read and tokenize the input file. 
+ const Twine &Filename = InArgsInfo[0]; + ErrorOr> File = MemoryBuffer::getFile(Filename); + if (!File) { + fatalError("Error opening file '" + Filename + + "': " + File.getError().message()); + } + + std::unique_ptr FileContents = std::move(*File); + StringRef Contents = FileContents->getBuffer(); + + std::vector Tokens = ExitOnErr(tokenizeRC(Contents)); + + if (BeVerbose) { + const Twine TokenNames[] = { +#define TOKEN(Name) #Name, +#define SHORT_TOKEN(Name, Ch) #Name, +#include "ResourceScriptTokenList.h" +#undef TOKEN +#undef SHORT_TOKEN + }; + + for (const RCToken &Token : Tokens) { + outs() << TokenNames[static_cast(Token.kind())] << ": " + << Token.value(); + if (Token.kind() == RCToken::Kind::Int) + outs() << "; int value = " << Token.intValue(); + + outs() << "\n"; + } + } return 0; }