[COFF] Add and use a zero-copy tokenizer for .drectve

This generalizes the main Windows command line tokenizer to be able to
produce StringRef substrings as well as freshly copied C strings. The
implementation is still shared with the normal tokenizer, which is
important because the normal tokenizer is covered by existing unit tests.

.drectve sections can be very long. They can potentially list every
symbol in the object file by name, so it is worth avoiding these string
copies.
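
As a rough usage sketch (illustrative only: the directive string and the
function name below are made up for this example, not taken from the
patch), the new no-copy entry point can be driven like this:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/StringSaver.h"

  void tokenizeDirectivesSketch() {
    llvm::BumpPtrAllocator alloc;
    llvm::StringSaver saver(alloc);
    // Made-up directive string; real ones come from .drectve sections.
    llvm::StringRef directives = "/EXPORT:foo /EXPORT:bar /INCLUDE:baz";
    llvm::SmallVector<llvm::StringRef, 16> tokens;
    // Unquoted tokens are substrings of `directives`; tokens that need
    // unquoting are copied into memory owned by `saver`.
    llvm::cl::TokenizeWindowsCommandLineNoCopy(directives, saver, tokens);
  }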

This saves a lot of memory when linking chrome.dll with PGO
instrumentation:

             BEFORE      AFTER      % IMP
peak memory: 6657.76MB   4983.54MB  -25%
real:        4m30.875s   2m26.250s  -46%

The time improvement may not be real, since my machine was noisy while
running this, but the peak memory usage improvement should be real.

This change may also help apps that heavily use dllexport annotations,
because those also use linker directives in object files. Apps that do
not use many directives are unlikely to be affected.

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D79262
Reid Kleckner 2020-05-01 07:34:12 -07:00
parent f7bf28b2c0
commit 270d3faf6e
3 changed files with 103 additions and 63 deletions


@@ -861,19 +861,25 @@ opt::InputArgList ArgParser::parse(ArrayRef<const char *> argv) {
}

// Tokenizes and parses a given string as command line in .drective section.
// /EXPORT options are processed in fastpath.
ParsedDirectives ArgParser::parseDirectives(StringRef s) {
  ParsedDirectives result;
  SmallVector<const char *, 16> rest;

  // Handle /EXPORT and /INCLUDE in a fast path. These directives can appear for
  // potentially every symbol in the object, so they must be handled quickly.
  SmallVector<StringRef, 16> tokens;
  cl::TokenizeWindowsCommandLineNoCopy(s, saver, tokens);
  for (StringRef tok : tokens) {
    if (tok.startswith_lower("/export:") || tok.startswith_lower("-export:"))
      result.exports.push_back(tok.substr(strlen("/export:")));
    else if (tok.startswith_lower("/include:") ||
             tok.startswith_lower("-include:"))
      result.includes.push_back(tok.substr(strlen("/include:")));
    else {
      // Save non-null-terminated strings to make proper C strings.
      bool HasNul = tok.data()[tok.size()] == '\0';
      rest.push_back(HasNul ? tok.data() : saver.save(tok).data());
    }
  }

  // Make InputArgList from unparsed string vectors.

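For illustration, the fast path above matches the option prefix
case-insensitively and keeps only the symbol name as a view into the same
buffer. A minimal sketch on a made-up token (the helper below is
hypothetical, not part of the patch):

  #include "llvm/ADT/StringRef.h"
  #include <cstring>

  // Hypothetical helper mirroring the fast path; not part of the patch.
  llvm::StringRef exportedSymbol(llvm::StringRef tok) {
    if (tok.startswith_lower("/export:") || tok.startswith_lower("-export:"))
      return tok.substr(strlen("/export:")); // "/EXPORT:foo" -> "foo", no copy
    return llvm::StringRef();
  }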

@@ -2027,6 +2027,13 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
                                SmallVectorImpl<const char *> &NewArgv,
                                bool MarkEOLs = false);

/// Tokenizes a Windows command line while attempting to avoid copies. If no
/// quoting or escaping was used, this produces substrings of the original
/// string. If a token requires unquoting, it will be allocated with the
/// StringSaver.
void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver,
                                      SmallVectorImpl<StringRef> &NewArgv);

/// String tokenization function type. Should be compatible with either
/// Windows or Unix command line tokenizers.
using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,


@@ -919,91 +919,118 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
  return I - 1;
}

// Windows treats whitespace, double quotes, and backslashes specially.
static bool isWindowsSpecialChar(char C) {
  return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
}

// Windows tokenization implementation. The implementation is designed to be
// inlined and specialized for the two user entry points.
static inline void
tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
                               function_ref<void(StringRef)> AddToken,
                               bool AlwaysCopy, function_ref<void()> MarkEOL) {
  SmallString<128> Token;

  // Try to do as much work inside the state machine as possible.
  enum { INIT, UNQUOTED, QUOTED } State = INIT;
  for (size_t I = 0, E = Src.size(); I < E; ++I) {
    switch (State) {
    case INIT: {
      assert(Token.empty() && "token should be empty in initial state");
      // Eat whitespace before a token.
      while (I < E && isWhitespaceOrNull(Src[I])) {
        if (Src[I] == '\n')
          MarkEOL();
        ++I;
      }
      // Stop if this was trailing whitespace.
      if (I >= E)
        break;
      size_t Start = I;
      while (I < E && !isWindowsSpecialChar(Src[I]))
        ++I;
      StringRef NormalChars = Src.slice(Start, I);
      if (I >= E || isWhitespaceOrNull(Src[I])) {
        if (I < E && Src[I] == '\n')
          MarkEOL();
        // No special characters: slice out the substring and start the next
        // token. Copy the string if the caller asks us to.
        AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
      } else if (Src[I] == '\"') {
        Token += NormalChars;
        State = QUOTED;
      } else if (Src[I] == '\\') {
        Token += NormalChars;
        I = parseBackslash(Src, I, Token);
        State = UNQUOTED;
      } else {
        llvm_unreachable("unexpected special character");
      }
      break;
    }

    case UNQUOTED:
      if (isWhitespaceOrNull(Src[I])) {
        // Whitespace means the end of the token. If we are in this state, the
        // token must have contained a special character, so we must copy the
        // token.
        AddToken(Saver.save(Token.str()));
        Token.clear();
        if (Src[I] == '\n')
          MarkEOL();
        State = INIT;
      } else if (Src[I] == '\"') {
        State = QUOTED;
      } else if (Src[I] == '\\') {
        I = parseBackslash(Src, I, Token);
      } else {
        Token.push_back(Src[I]);
      }
      break;

    case QUOTED:
      if (Src[I] == '\"') {
        if (I < (E - 1) && Src[I + 1] == '"') {
          // Consecutive double-quotes inside a quoted string implies one
          // double-quote.
          Token.push_back('"');
          ++I;
        } else {
          // Otherwise, end the quoted portion and return to the unquoted state.
          State = UNQUOTED;
        }
      } else if (Src[I] == '\\') {
        I = parseBackslash(Src, I, Token);
      } else {
        Token.push_back(Src[I]);
      }
      break;
    }
  }

  // Append the last token after hitting EOF with no whitespace.
  if (!Token.empty())
    AddToken(Saver.save(Token.str()));
}

void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
                                    SmallVectorImpl<const char *> &NewArgv,
                                    bool MarkEOLs) {
  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
  auto OnEOL = [&]() {
    if (MarkEOLs)
      NewArgv.push_back(nullptr);
  };
  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
                                 /*AlwaysCopy=*/true, OnEOL);
}

void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
                                          SmallVectorImpl<StringRef> &NewArgv) {
  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
  auto OnEOL = []() {};
  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
                                 OnEOL);
}
void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,
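
Since the shared implementation is exercised by the existing unit tests, a
check along these lines could confirm the no-copy contract (a hypothetical
test sketch, not part of this patch): tokens without special characters
alias the input buffer, while quoted tokens are unescaped into memory owned
by the StringSaver.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/StringSaver.h"
  #include <cassert>

  // Hypothetical check, not from the patch.
  void checkNoCopyAliasing() {
    llvm::BumpPtrAllocator Alloc;
    llvm::StringSaver Saver(Alloc);
    llvm::StringRef Src = "plain \"needs unquoting\"";
    llvm::SmallVector<llvm::StringRef, 4> Argv;
    llvm::cl::TokenizeWindowsCommandLineNoCopy(Src, Saver, Argv);
    assert(Argv.size() == 2);
    // "plain" contains no special characters, so it is a slice of Src.
    assert(Argv[0] == "plain" && Argv[0].data() == Src.data());
    // The quoted token required unquoting, so it was saved by Saver.
    assert(Argv[1] == "needs unquoting");
  }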