Add .rc scripts tokenizer.

This extends the llvm-rc tool shell with the ability to tokenize its input files. Currently, ASCII and ASCII-compatible UTF-8 files are supported. Thanks to Nico Weber (thakis) for his original work in this area. Differential Revision: https://reviews.llvm.org/D35957 llvm-svn: 310621
2017-08-10 16:21:44 +00:00 · 2017-08-10 16:21:44 +00:00 · 719e22d4f4
parent 27fbd1e14a
commit 719e22d4f4
7 changed files with 507 additions and 2 deletions
--- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc
+++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc
@ -0,0 +1,8 @@
 1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
 He11o LLVM
 "RC string test.",L"Another RC string test.'&{",42,100
         ":))"
--- a/llvm/test/tools/llvm-rc/tokenizer.test
+++ b/llvm/test/tools/llvm-rc/tokenizer.test
@ -0,0 +1,35 @@
 ; RUN: llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s
 ; CHECK:  Int: 1; int value = 1
 ; CHECK-NEXT:  Plus: +
 ; CHECK-NEXT:  Int: 2; int value = 2
 ; CHECK-NEXT:  Minus: -
 ; CHECK-NEXT:  Int: 3214L; int value = 3214
 ; CHECK-NEXT:  Amp: &
 ; CHECK-NEXT:  Int: 0x120894; int value = 1181844
 ; CHECK-NEXT:  Int: 032173; int value = 13435
 ; CHECK-NEXT:  Int: 2; int value = 2
 ; CHECK-NEXT:  Pipe: |
 ; CHECK-NEXT:  Amp: &
 ; CHECK-NEXT:  Tilde: ~
 ; CHECK-NEXT:  Plus: +
 ; CHECK-NEXT:  LeftParen: (
 ; CHECK-NEXT:  Minus: -
 ; CHECK-NEXT:  Int: 7; int value = 7
 ; CHECK-NEXT:  RightParen: )
 ; CHECK-NEXT:  BlockBegin: {
 ; CHECK-NEXT:  Int: 0xabcdef; int value = 11259375
 ; CHECK-NEXT:  Int: 0xABCDEFl; int value = 11259375
 ; CHECK-NEXT:  BlockEnd: }
 ; CHECK-NEXT:  BlockBegin: Begin
 ; CHECK-NEXT:  BlockEnd: End
 ; CHECK-NEXT:  Identifier: He11o
 ; CHECK-NEXT:  Identifier: LLVM
 ; CHECK-NEXT:  String: "RC string test."
 ; CHECK-NEXT:  Comma: ,
 ; CHECK-NEXT:  String: L"Another RC string test.'&{"
 ; CHECK-NEXT:  Comma: ,
 ; CHECK-NEXT:  Int: 42; int value = 42
 ; CHECK-NEXT:  Comma: ,
 ; CHECK-NEXT:  Int: 100; int value = 100
 ; CHECK-NEXT:  String: ":))"
--- a/llvm/tools/llvm-rc/CMakeLists.txt
+++ b/llvm/tools/llvm-rc/CMakeLists.txt
@ -10,4 +10,5 @@ add_public_tablegen_target(RcTableGen)
 # The llvm-rc tool is built from its driver (llvm-rc.cpp) and the .rc script
 # tokenizer (ResourceScriptToken.cpp).
 add_llvm_tool(llvm-rc
  llvm-rc.cpp
  ResourceScriptToken.cpp
  )
--- a/llvm/tools/llvm-rc/ResourceScriptToken.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.cpp
@ -0,0 +1,296 @@
 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===---------------------------------------------------------------------===//
 //
 // This file implements an interface defined in ResourceScriptToken.h.
 // In particular, it defines an .rc script tokenizer.
 //
 //===---------------------------------------------------------------------===//
 #include "ResourceScriptToken.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
 #include <cctype>
 #include <cstdlib>
 #include <utility>
 using namespace llvm;
 using Kind = RCToken::Kind;
 // Checks if Representation is a correct description of an RC integer.
 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
 // character (that is the difference between our representation and
 // StringRef's one). If Representation is correct, 'true' is returned and
 // the parsed value is stored in Num; otherwise 'false' is returned.
 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  if (Representation.empty())
    return false;
  // Strip the optional trailing 'L' (in either case). The character is
  // converted to unsigned char first: passing a negative value to
  // std::toupper is undefined behavior, and plain char may be signed.
  if (std::toupper(static_cast<unsigned char>(Representation.back())) == 'L')
    Representation = Representation.drop_back(1);
  // getAsInteger reports failure by returning true, hence the negation.
  return !Representation.getAsInteger<uint32_t>(0, Num);
 }
 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
 // Returns the numeric value of an integer token. Must only be called on
 // tokens of kind Kind::Int.
 uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger). Result is nevertheless zero-initialized, so that a build
  // with assertions disabled never returns an indeterminate value should that
  // assumption be violated.
  uint32_t Result = 0;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
 }
 // Returns the textual representation the token was constructed with.
 StringRef RCToken::value() const {
  return this->TokenValue;
 }
 // Returns the kind the token was constructed with.
 Kind RCToken::kind() const {
  return this->TokenKind;
 }
 // Creates a StringError whose text is message prefixed with
 // "Error parsing file: ".
 static Error getStringError(const Twine &message) {
  // The concatenated Twine is built and consumed within this one expression.
  return make_error<StringError>(Twine("Error parsing file: ").concat(message),
                                 inconvertibleErrorCode());
 }
 namespace {
 // Splits the text of an .rc script into RCTokens. The tokenizer keeps only a
 // reference to the input and a cursor (Pos) into it.
 class Tokenizer {
 public:
  // Pos is initialized here as well (run() resets it again), so that no member
  // is ever read while holding an indeterminate value.
  explicit Tokenizer(StringRef Input)
      : Data(Input), DataLength(Input.size()), Pos(0) {}
  // Tokenizes the whole input, starting from its beginning. Returns the
  // tokens in input order, or an Error describing the first problem found.
  Expected<std::vector<RCToken>> run();
 private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();
  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);
  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;
  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;
  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  // Check if tokenizer can continue reading an integer.
  bool canContinueInt() const;
  // Check if tokenizer can start reading a string literal, i.e. it is about
  // to read a double quote optionally preceded by 'L' or 'l'.
  bool canStartString() const;
  // Check if the whole input has been consumed.
  bool streamEof() const;
  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;
  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &Token) const;
  // The input being tokenized; not owned by the tokenizer.
  StringRef Data;
  // DataLength is the size of Data; Pos is the index of the next character
  // to be read.
  size_t DataLength, Pos;
 };
 // Tokenizes the whole input from its beginning. Returns the tokens in the
 // order they appear, or an Error describing the first problem encountered.
 Expected<std::vector<RCToken>> Tokenizer::run() {
  std::vector<RCToken> Tokens;
  Pos = 0;
  // The script may open with a UTF-8 Byte Order Mark; skip it when present.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);
  // skipWhitespaces() yields false once only whitespace was left, which ends
  // the loop exactly like reaching the end of the stream does.
  while (!streamEof() && skipWhitespaces()) {
    const Kind CurKind = classifyCurrentToken();
    if (CurKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));
    const size_t Begin = Pos;
    Error ConsumeError = consumeToken(CurKind);
    if (ConsumeError)
      return std::move(ConsumeError);
    // The token text is the input range [Begin, Pos).
    RCToken Token(CurKind, Data.substr(Begin, Pos - Begin));
    switch (CurKind) {
    case Kind::Identifier:
      // BEGIN / END identifiers are turned into block tokens here.
      processIdentifier(Token);
      break;
    case Kind::Int: {
      // Only validate here; the value itself is recomputed on demand.
      uint32_t Parsed;
      if (!rcGetAsInteger(Token.value(), Parsed)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
      break;
    }
    default:
      break;
    }
    Tokens.push_back(Token);
  }
  return Tokens;
 }
 // Moves the cursor forward by Amount characters. Returns true if the cursor
 // does not sit at the end of the input afterwards.
 bool Tokenizer::advance(size_t Amount) {
  Pos = Pos + Amount;
  const bool ReachedEnd = streamEof();
  return !ReachedEnd;
 }
 // Skips the whitespace characters starting at the current position. Returns
 // false if the end of the stream has been reached, true otherwise.
 bool Tokenizer::skipWhitespaces() {
  // The input may contain arbitrary bytes. std::isspace has undefined behavior
  // for negative arguments, which a (possibly signed) plain char produces for
  // non-ASCII bytes, so the character is converted to unsigned char first.
  while (!streamEof() && std::isspace(static_cast<unsigned char>(Data[Pos])))
    advance();
  return !streamEof();
 }
 // Moves the cursor past the token of kind TokenKind that starts at the
 // current position. Returns Error::success() once the token is consumed, or
 // an error if a string literal is not closed before the end of its line or
 // of the input. TokenKind must not be Kind::Invalid.
 Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
 #define TOKEN(Name)
 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
 #include "ResourceScriptTokenList.h"
 #undef TOKEN
 #undef SHORT_TOKEN
    advance();
    return Error::success();
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();
  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();
  case Kind::String:
    // Consume the preceding 'L', if there is any. The character goes through
    // unsigned char because std::toupper is undefined for negative values.
    if (std::toupper(static_cast<unsigned char>(Data[Pos])) == 'L')
      advance();
    // Consume the double-quote.
    advance();
    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }
      advance();
    }
  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
    break;
  }
  // Reached only for Kind::Invalid (or a value outside the enumeration) when
  // assertions are disabled. Report an error rather than flowing off the end
  // of a non-void function, which would be undefined behavior.
  return getStringError("Cannot consume an invalid token.");
 }
 // Tells whether the not yet consumed part of the input begins with
 // FollowingChars.
 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  const StringRef Remaining = Data.drop_front(Pos);
  return Remaining.startswith(FollowingChars);
 }
 // Returns true if the character at the current position may begin an
 // identifier, i.e. it is a letter or an underscore. Must not be called at
 // the end of the stream.
 bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());
  // std::isalpha has undefined behavior for negative arguments, which a
  // (possibly signed) plain char produces for non-ASCII bytes; classify the
  // character as unsigned char instead.
  const unsigned char CurChar = static_cast<unsigned char>(Data[Pos]);
  return std::isalpha(CurChar) || CurChar == '_';
 }
 // Returns true if the character at the current position may continue an
 // identifier, i.e. it is a letter, a digit or an underscore. Must not be
 // called at the end of the stream.
 bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  // std::isalnum has undefined behavior for negative arguments, which a
  // (possibly signed) plain char produces for non-ASCII bytes; classify the
  // character as unsigned char instead.
  const unsigned char CurChar = static_cast<unsigned char>(Data[Pos]);
  return std::isalnum(CurChar) || CurChar == '_';
 }
 // Returns true if the character at the current position is a decimal digit
 // and thus may begin an integer. Must not be called at the end of the stream.
 bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  // Converted to unsigned char because std::isdigit has undefined behavior
  // for negative arguments (non-ASCII bytes held in a signed plain char).
  return std::isdigit(static_cast<unsigned char>(Data[Pos]));
 }
 // Returns true if the character at the current position is alphanumeric and
 // thus may continue an integer. Must not be called at the end of the stream.
 bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  // Converted to unsigned char because std::isalnum has undefined behavior
  // for negative arguments (non-ASCII bytes held in a signed plain char).
  return std::isalnum(static_cast<unsigned char>(Data[Pos]));
 }
 // Tells whether a string literal begins at the current position: a double
 // quote, optionally preceded by an upper- or lowercase 'L'.
 bool Tokenizer::canStartString() const {
  const char *const Openers[] = {"\"", "L\"", "l\""};
  for (const char *Opener : Openers)
    if (willNowRead(Opener))
      return true;
  return false;
 }
 // Tells whether the cursor sits exactly at the end of the input.
 bool Tokenizer::streamEof() const {
  const bool AtEnd = (Pos == DataLength);
  return AtEnd;
 }
 // Determines the kind of the token beginning at the current position, or
 // Kind::Invalid if no token can begin with the current character.
 Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  // Strings are tested before identifiers, as L"..." also begins with a
  // letter.
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;
  // One-character token classification; anything not listed stays Invalid.
  Kind ShortKind = Kind::Invalid;
  switch (Data[Pos]) {
 #define TOKEN(Name)
 #define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    ShortKind = Kind::Name;                                                    \
    break;
 #include "ResourceScriptTokenList.h"
 #undef TOKEN
 #undef SHORT_TOKEN
  default:
    break;
  }
  return ShortKind;
 }
 // Replaces an identifier token spelled "begin" or "end" (compared without
 // regard to case) by a BlockBegin or BlockEnd token with the same text.
 // Any other identifier is left untouched.
 void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  const StringRef Text = Token.value();
  const bool IsBegin = Text.equals_lower("begin");
  const bool IsEnd = !IsBegin && Text.equals_lower("end");
  if (IsBegin) {
    Token = RCToken(Kind::BlockBegin, Text);
    return;
  }
  if (IsEnd)
    Token = RCToken(Kind::BlockEnd, Text);
 }
 } // anonymous namespace
 namespace llvm {
 // Public entry point: runs a Tokenizer over Input and forwards its result.
 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  Tokenizer Lexer(Input);
  return Lexer.run();
 }
 } // namespace llvm
--- a/llvm/tools/llvm-rc/ResourceScriptToken.h
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.h
@ -0,0 +1,81 @@
 //===-- ResourceScriptToken.h -----------------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===---------------------------------------------------------------------===//
 //
 // This declares the .rc script tokens and defines an interface for tokenizing
 // the input data. The list of available tokens is located at
 // ResourceScriptTokenList.h.
 //
 // Note that the tokenizer does not support comments or preprocessor
 // directives. The preprocessor should do its work on the .rc file before
 // running llvm-rc.
 //
 // As for now, it is possible to parse ASCII files only (the behavior on
 // UTF files might be undefined). However, it already consumes UTF-8 BOM, if
 // there is any. Thus, ASCII-compatible UTF-8 files are tokenized correctly.
 //
 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380599(v=vs.85).aspx
 //
 //===---------------------------------------------------------------------===//
 #ifndef LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
 #define LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
 #include <map>
 #include <string>
 #include <vector>
 namespace llvm {
 // A definition of a single resource script token. Each token has its kind
 // (declared in ResourceScriptTokenList) and holds a value - a reference
 // representation of the token.
 // RCToken does not claim ownership on its value. A memory buffer containing
 // the token value should be stored in a safe place and cannot be freed
 // nor reallocated.
 class RCToken {
 public:
  // One enumerator is generated for every TOKEN and SHORT_TOKEN entry of
  // ResourceScriptTokenList.h, in the order they are listed there.
  enum class Kind {
 #define TOKEN(Name) Name,
 #define SHORT_TOKEN(Name, Ch) Name,
 #include "ResourceScriptTokenList.h"
 #undef TOKEN
 #undef SHORT_TOKEN
  };
  // Create a token of kind RCTokenKind with the textual representation Value.
  RCToken(RCToken::Kind RCTokenKind, StringRef Value);
  // Get an integer value of the integer token.
  uint32_t intValue() const;
  // Get the textual representation of the token.
  StringRef value() const;
  // Get the kind of the token.
  Kind kind() const;
 private:
  Kind TokenKind;
  // Non-owning reference to the token text (see the class comment).
  StringRef TokenValue;
 };
 // Tokenize Input.
 // In case no error occurred, the return value contains
 //   tokens in order they were in the input file.
 // In case of any error, the return value contains
 //   a textual representation of error.
 //
 // Tokens returned by this function hold only references to the parts
 // of the Input. Memory buffer containing Input cannot be freed,
 // modified or reallocated.
 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input);
 } // namespace llvm
 #endif
--- a/llvm/tools/llvm-rc/ResourceScriptTokenList.h
+++ b/llvm/tools/llvm-rc/ResourceScriptTokenList.h
@ -0,0 +1,35 @@
 //===-- ResourceScriptTokenList.h -------------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===---------------------------------------------------------------------===//
 //
 // This is a part of llvm-rc tokenizer. It lists all the possible tokens
 // that might occur in a correct .rc script.
 //
 //===---------------------------------------------------------------------===//
 // Long tokens. They might consist of more than one character.
 TOKEN(Invalid)      // Invalid token. Should not occur in a valid script.
 TOKEN(Int)          // Integer (decimal, octal or hexadecimal).
 TOKEN(String)       // String value.
 TOKEN(Identifier)   // Script identifier (resource name or type).
 // Short tokens. They usually consist of exactly one character.
 // The definitions are of the form SHORT_TOKEN(TokenName, TokenChar).
 // TokenChar is the one-character token representation occurring in the correct
 // .rc scripts.
 SHORT_TOKEN(BlockBegin, '{')   // Start of the script block; can also be BEGIN.
 SHORT_TOKEN(BlockEnd, '}')     // End of the block; can also be END.
 SHORT_TOKEN(Comma, ',')        // Comma - resource arguments separator.
 SHORT_TOKEN(Plus, '+')         // Addition operator.
 SHORT_TOKEN(Minus, '-')        // Subtraction operator.
 SHORT_TOKEN(Pipe, '|')         // Bitwise-OR operator.
 SHORT_TOKEN(Amp, '&')          // Bitwise-AND operator.
 SHORT_TOKEN(Tilde, '~')        // Bitwise-NOT operator.
 SHORT_TOKEN(LeftParen, '(')    // Left parenthesis in the script expressions.
 SHORT_TOKEN(RightParen, ')')   // Right parenthesis.
--- a/llvm/tools/llvm-rc/llvm-rc.cpp
+++ b/llvm/tools/llvm-rc/llvm-rc.cpp
@ -1,4 +1,4 @@
-//===- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*--===//
+//===-- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 #include "ResourceScriptToken.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Error.h"
@ -60,6 +62,12 @@ public:
 };
 static ExitOnError ExitOnErr;
 // Prints Message followed by a newline to errs() and terminates the process
 // with exit code 1. The Twine is taken by const reference, which is how
 // Twine arguments are meant to be passed.
 LLVM_ATTRIBUTE_NORETURN static void fatalError(const Twine &Message) {
  errs() << Message << "\n";
  exit(1);
 }
 } // anonymous namespace
 int main(int argc_, const char *argv_[]) {
@ -81,8 +89,49 @@ int main(int argc_, const char *argv_[]) {
  opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);
  // The tool prints nothing when invoked with no command-line arguments.
-  if (InputArgs.hasArg(OPT_HELP))
+  if (InputArgs.hasArg(OPT_HELP)) {
    T.PrintHelp(outs(), "rc", "Resource Converter", false);
    return 0;
  }
  const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE);
  std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);
  if (InArgsInfo.size() != 1) {
    fatalError("Exactly one input file should be provided.");
  }
  // Read and tokenize the input file.
  const Twine &Filename = InArgsInfo[0];
  ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);
  if (!File) {
    fatalError("Error opening file '" + Filename +
               "': " + File.getError().message());
  }
  std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);
  StringRef Contents = FileContents->getBuffer();
  std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));
  if (BeVerbose) {
    const Twine TokenNames[] = {
 #define TOKEN(Name) #Name,
 #define SHORT_TOKEN(Name, Ch) #Name,
 #include "ResourceScriptTokenList.h"
 #undef TOKEN
 #undef SHORT_TOKEN
    };
    for (const RCToken &Token : Tokens) {
      outs() << TokenNames[static_cast<int>(Token.kind())] << ": "
             << Token.value();
      if (Token.kind() == RCToken::Kind::Int)
        outs() << "; int value = " << Token.intValue();
      outs() << "\n";
    }
  }
  return 0;
 }