[COFF] Add and use a zero-copy tokenizer for .drectve
This generalizes the main Windows command line tokenizer so that it can produce StringRef substrings as well as freshly copied C strings. The implementation is still shared with the normal tokenizer, which is important because we have unit tests for that code path.

.drectve sections can be very long: they can potentially list every symbol in the object file by name, so it is worth avoiding these string copies.

This saves a lot of memory when linking chrome.dll with PGO instrumentation:

               BEFORE       AFTER        % IMP
peak memory:   6657.76MB    4983.54MB    -25%
real:          4m30.875s    2m26.250s    -46%

The time improvement may not be real; my machine was noisy while running this. The peak memory usage improvement should be real, though.

This change may also help apps that heavily use dllexport annotations, because those also produce linker directives in object files. Apps that do not use many directives are unlikely to be affected.

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D79262
commit 270d3faf6e (parent f7bf28b2c0)
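For context, here is a minimal sketch (not part of the commit) of how a caller might drive the two tokenizer entry points touched by this change. The entry points, StringSaver, BumpPtrAllocator, and SmallVector are existing LLVM Support APIs that appear in the diff below; the wrapper function and the directive string are illustrative assumptions only.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/StringSaver.h"

using namespace llvm;

// Illustrative only: tokenize a directive string with both entry points.
static void tokenizeBothWays(StringRef directives) {
  BumpPtrAllocator alloc;
  StringSaver saver(alloc);

  // Copying tokenizer: every token is saved through the StringSaver so it can
  // be handed out as a null-terminated const char *.
  SmallVector<const char *, 16> copied;
  cl::TokenizeWindowsCommandLine(directives, saver, copied);

  // Zero-copy tokenizer added by this change: tokens that need no unquoting
  // are StringRef slices of the original buffer; only tokens that required
  // unquoting are allocated through the StringSaver.
  SmallVector<StringRef, 16> slices;
  cl::TokenizeWindowsCommandLineNoCopy(directives, saver, slices);
}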
@@ -861,19 +861,25 @@ opt::InputArgList ArgParser::parse(ArrayRef<const char *> argv) {
 }
 
 // Tokenizes and parses a given string as command line in .drective section.
-// /EXPORT options are processed in fastpath.
 ParsedDirectives ArgParser::parseDirectives(StringRef s) {
   ParsedDirectives result;
   SmallVector<const char *, 16> rest;
 
-  for (StringRef tok : tokenize(s)) {
+  // Handle /EXPORT and /INCLUDE in a fast path. These directives can appear for
+  // potentially every symbol in the object, so they must be handled quickly.
+  SmallVector<StringRef, 16> tokens;
+  cl::TokenizeWindowsCommandLineNoCopy(s, saver, tokens);
+  for (StringRef tok : tokens) {
     if (tok.startswith_lower("/export:") || tok.startswith_lower("-export:"))
       result.exports.push_back(tok.substr(strlen("/export:")));
     else if (tok.startswith_lower("/include:") ||
              tok.startswith_lower("-include:"))
       result.includes.push_back(tok.substr(strlen("/include:")));
-    else
-      rest.push_back(tok.data());
+    else {
+      // Save non-null-terminated strings to make proper C strings.
+      bool HasNul = tok.data()[tok.size()] == '\0';
+      rest.push_back(HasNul ? tok.data() : saver.save(tok).data());
+    }
   }
 
   // Make InputArgList from unparsed string vectors.
@@ -2027,6 +2027,13 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
                                 SmallVectorImpl<const char *> &NewArgv,
                                 bool MarkEOLs = false);
 
+/// Tokenizes a Windows command line while attempting to avoid copies. If no
+/// quoting or escaping was used, this produces substrings of the original
+/// string. If a token requires unquoting, it will be allocated with the
+/// StringSaver.
+void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver,
+                                      SmallVectorImpl<StringRef> &NewArgv);
+
 /// String tokenization function type.  Should be compatible with either
 /// Windows or Unix command line tokenizers.
 using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,
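As an illustration of the contract documented in the new declaration above (again, not part of the commit), tokens that contain no quoting or escaping should alias the input buffer, while quoted tokens are materialized through the StringSaver. The input string and assertions below are hypothetical, written against that documented behavior.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>

static void checkNoCopyExpectation() {
  llvm::StringRef src = "/export:foo \"quoted arg\" /include:bar";
  llvm::BumpPtrAllocator alloc;
  llvm::StringSaver saver(alloc);
  llvm::SmallVector<llvm::StringRef, 4> toks;
  llvm::cl::TokenizeWindowsCommandLineNoCopy(src, saver, toks);

  assert(toks.size() == 3);
  // "/export:foo" has no quotes or backslashes, so it should be a slice of src.
  assert(toks[0].data() >= src.begin() && toks[0].data() < src.end());
  // "quoted arg" required unquoting, so it lives in the StringSaver's storage.
  assert(toks[1] == "quoted arg");
  assert(toks[2] == "/include:bar");
}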
@@ -919,91 +919,118 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
   return I - 1;
 }
 
-void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
-                                    SmallVectorImpl<const char *> &NewArgv,
-                                    bool MarkEOLs) {
-  SmallString<128> Token;
-
-  // This is a small state machine to consume characters until it reaches the
-  // end of the source string.
-  enum { INIT, UNQUOTED, QUOTED } State = INIT;
-  for (size_t I = 0, E = Src.size(); I != E; ++I) {
-    char C = Src[I];
-
-    // INIT state indicates that the current input index is at the start of
-    // the string or between tokens.
-    if (State == INIT) {
-      if (isWhitespaceOrNull(C)) {
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
-      }
-      if (C == '"') {
-        State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
-        I = parseBackslash(Src, I, Token);
-        State = UNQUOTED;
-        continue;
-      }
-      Token.push_back(C);
-      State = UNQUOTED;
-      continue;
-    }
-
-    // UNQUOTED state means that it's reading a token not quoted by double
-    // quotes.
-    if (State == UNQUOTED) {
-      // Whitespace means the end of the token.
-      if (isWhitespaceOrNull(C)) {
-        NewArgv.push_back(Saver.save(StringRef(Token)).data());
-        Token.clear();
-        State = INIT;
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
-      }
-      if (C == '"') {
-        State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
-        I = parseBackslash(Src, I, Token);
-        continue;
-      }
-      Token.push_back(C);
-      continue;
-    }
-
-    // QUOTED state means that it's reading a token quoted by double quotes.
-    if (State == QUOTED) {
-      if (C == '"') {
-        if (I < (E - 1) && Src[I + 1] == '"') {
-          // Consecutive double-quotes inside a quoted string implies one
-          // double-quote.
-          Token.push_back('"');
-          I = I + 1;
-          continue;
-        }
-        State = UNQUOTED;
-        continue;
-      }
-      if (C == '\\') {
-        I = parseBackslash(Src, I, Token);
-        continue;
-      }
-      Token.push_back(C);
-    }
-  }
-  // Append the last token after hitting EOF with no whitespace.
-  if (!Token.empty())
-    NewArgv.push_back(Saver.save(StringRef(Token)).data());
-  // Mark the end of response files
-  if (MarkEOLs)
-    NewArgv.push_back(nullptr);
-}
+// Windows treats whitespace, double quotes, and backslashes specially.
+static bool isWindowsSpecialChar(char C) {
+  return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
+}
+
+// Windows tokenization implementation. The implementation is designed to be
+// inlined and specialized for the two user entry points.
+static inline void
+tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
+                               function_ref<void(StringRef)> AddToken,
+                               bool AlwaysCopy, function_ref<void()> MarkEOL) {
+  SmallString<128> Token;
+
+  // Try to do as much work inside the state machine as possible.
+  enum { INIT, UNQUOTED, QUOTED } State = INIT;
+  for (size_t I = 0, E = Src.size(); I < E; ++I) {
+    switch (State) {
+    case INIT: {
+      assert(Token.empty() && "token should be empty in initial state");
+      // Eat whitespace before a token.
+      while (I < E && isWhitespaceOrNull(Src[I])) {
+        if (Src[I] == '\n')
+          MarkEOL();
+        ++I;
+      }
+      // Stop if this was trailing whitespace.
+      if (I >= E)
+        break;
+      size_t Start = I;
+      while (I < E && !isWindowsSpecialChar(Src[I]))
+        ++I;
+      StringRef NormalChars = Src.slice(Start, I);
+      if (I >= E || isWhitespaceOrNull(Src[I])) {
+        if (I < E && Src[I] == '\n')
+          MarkEOL();
+        // No special characters: slice out the substring and start the next
+        // token. Copy the string if the caller asks us to.
+        AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
+      } else if (Src[I] == '\"') {
+        Token += NormalChars;
+        State = QUOTED;
+      } else if (Src[I] == '\\') {
+        Token += NormalChars;
+        I = parseBackslash(Src, I, Token);
+        State = UNQUOTED;
+      } else {
+        llvm_unreachable("unexpected special character");
+      }
+      break;
+    }
+
+    case UNQUOTED:
+      if (isWhitespaceOrNull(Src[I])) {
+        // Whitespace means the end of the token. If we are in this state, the
+        // token must have contained a special character, so we must copy the
+        // token.
+        AddToken(Saver.save(Token.str()));
+        Token.clear();
+        if (Src[I] == '\n')
+          MarkEOL();
+        State = INIT;
+      } else if (Src[I] == '\"') {
+        State = QUOTED;
+      } else if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+      } else {
+        Token.push_back(Src[I]);
+      }
+      break;
+
+    case QUOTED:
+      if (Src[I] == '\"') {
+        if (I < (E - 1) && Src[I + 1] == '"') {
+          // Consecutive double-quotes inside a quoted string implies one
+          // double-quote.
+          Token.push_back('"');
+          ++I;
+        } else {
+          // Otherwise, end the quoted portion and return to the unquoted state.
+          State = UNQUOTED;
+        }
+      } else if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+      } else {
+        Token.push_back(Src[I]);
+      }
+      break;
+    }
+  }
+
+  if (!Token.empty())
+    AddToken(Saver.save(Token.str()));
+}
+
+void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
+                                    SmallVectorImpl<const char *> &NewArgv,
+                                    bool MarkEOLs) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+  auto OnEOL = [&]() {
+    if (MarkEOLs)
+      NewArgv.push_back(nullptr);
+  };
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+                                 /*AlwaysCopy=*/true, OnEOL);
+}
+
+void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
+                                          SmallVectorImpl<StringRef> &NewArgv) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
+  auto OnEOL = []() {};
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
+                                 OnEOL);
+}
 
 void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,