[pseudo] Add crude heuristics to choose taken preprocessor branches.

In files where different preprocessing paths are possible, our goal is to
choose a preprocessed token sequence which we can parse that pins down as much
of the grammatical structure as possible.
This forms the "primary parse", and the not-taken branches get parsed later,
and are constrained to be compatible with the primary parse.

Concretely:
  int x =
    #ifdef // TAKEN
      2 + 2 + 2 // determined during primary parse to be an expression
    #else
      2 // constrained to be an expression during a secondary parse
    #endif
    ;

Differential Revision: https://reviews.llvm.org/D121165
This commit is contained in:
Sam McCall 2022-03-07 23:33:40 +01:00
parent f4f1cf6c31
commit af89e4792d
6 changed files with 345 additions and 8 deletions

View File

@ -78,6 +78,11 @@ struct DirectiveMap {
std::vector<std::pair<Directive, DirectiveMap>> Branches;
/// The directive terminating the conditional, should be #endif.
Directive End;
/// The index of the conditional branch we chose as active.
/// None indicates no branch was taken (e.g. #if 0 ... #endif).
/// The initial map from of `parse()` has no branches marked as taken.
/// See `chooseConditionalBranches()`.
llvm::Optional<unsigned> Taken;
};
/// Some piece of the file. {One of Code, Directive, Conditional}.
@ -87,7 +92,6 @@ struct DirectiveMap {
/// Extract preprocessor structure by examining the raw tokens.
static DirectiveMap parse(const TokenStream &);
// FIXME: add heuristically selection of conditional branches.
// FIXME: allow deriving a preprocessed stream
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
@ -98,6 +102,24 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const DirectiveMap::Conditional &);
/// Selects a "taken" branch for each conditional directive in the file.
///
/// The choice is somewhat arbitrary, but aims to produce a useful parse:
/// - idioms like `#if 0` are respected
/// - we avoid paths that reach `#error`
/// - we try to maximize the amount of code seen
/// The choice may also be "no branch taken".
///
/// Choices are also made for conditionals themselves inside not-taken branches:
/// #if 1 // taken!
/// #else // not taken
/// #if 1 // taken!
/// #endif
/// #endif
///
/// The choices are stored in Conditional::Taken nodes.
void chooseConditionalBranches(DirectiveMap &, const TokenStream &Code);
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
// Switch once we can use C++17.
class DirectiveMap::Chunk {

View File

@ -74,6 +74,20 @@ struct Token {
Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
}
/// Returns the next token in the stream. this may not be a sentinel.
const Token &next() const {
assert(Kind != tok::eof);
return *(this + 1);
}
/// Returns the next token in the stream, skipping over comments.
const Token &nextNC() const {
const Token *T = this;
do
T = &T->next();
while (T->Kind == tok::comment);
return *T;
}
/// The type of token as determined by clang's lexer.
clang::tok::TokenKind Kind = clang::tok::unknown;
};

View File

@ -151,10 +151,11 @@ DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
static void dump(llvm::raw_ostream &OS,
const DirectiveMap::Directive &Directive, unsigned Indent) {
OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
tok::getPPKeywordSpelling(Directive.Kind),
Directive.Tokens.size());
const DirectiveMap::Directive &Directive, unsigned Indent,
bool Taken = false) {
OS.indent(Indent) << llvm::formatv(
"#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
Directive.Tokens.size(), Taken ? " TAKEN" : "");
}
static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
unsigned Indent) {
@ -163,8 +164,9 @@ static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
static void dump(llvm::raw_ostream &OS,
const DirectiveMap::Conditional &Conditional,
unsigned Indent) {
for (const auto &Branch : Conditional.Branches) {
dump(OS, Branch.first, Indent);
for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
const auto &Branch = Conditional.Branches[I];
dump(OS, Branch.first, Indent, Conditional.Taken == I);
dump(OS, Branch.second, Indent + 2);
}
dump(OS, Conditional.End, Indent);
@ -203,5 +205,147 @@ OSTREAM_DUMP(DirectiveMap::Conditional)
OSTREAM_DUMP(DirectiveMap::Code)
#undef OSTREAM_DUMP
namespace {
// Makes choices about conditional branches.
//
// Generally it tries to maximize the amount of useful code we see.
//
// Caveat: each conditional is evaluated independently. Consider this code:
// #ifdef WINDOWS
// bool isWindows = true;
// #endif
// #ifndef WINDOWS
// bool isWindows = false;
// #endif
// We take both branches and define isWindows twice. We could track more state
// in order to produce a consistent view, but this is complex.
class BranchChooser {
public:
BranchChooser(const TokenStream &Code) : Code(Code) {}
void choose(DirectiveMap &M) { walk(M); }
private:
// Describes code seen by making particular branch choices. Higher is better.
struct Score {
int Tokens = 0; // excluding comments and directives
int Directives = 0;
int Errors = 0; // #error directives
bool operator>(const Score &Other) const {
// Seeing errors is bad, other things are good.
return std::make_tuple(-Errors, Tokens, Directives) >
std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
}
Score &operator+=(const Score &Other) {
Tokens += Other.Tokens;
Directives += Other.Directives;
Errors += Other.Errors;
return *this;
}
};
Score walk(DirectiveMap::Code &C) {
Score S;
for (const Token &T : Code.tokens(C.Tokens))
if (T.Kind != tok::comment)
++S.Tokens;
return S;
}
Score walk(DirectiveMap::Directive &D) {
Score S;
S.Directives = 1;
S.Errors = D.Kind == tok::pp_error;
return S;
}
Score walk(DirectiveMap::Chunk &C) {
switch (C.kind()) {
case DirectiveMap::Chunk::K_Code:
return walk((DirectiveMap::Code &)C);
case DirectiveMap::Chunk::K_Directive:
return walk((DirectiveMap::Directive &)C);
case DirectiveMap::Chunk::K_Conditional:
return walk((DirectiveMap::Conditional &)C);
case DirectiveMap::Chunk::K_Empty:
break;
}
llvm_unreachable("bad chunk kind");
}
Score walk(DirectiveMap &M) {
Score S;
for (DirectiveMap::Chunk &C : M.Chunks)
S += walk(C);
return S;
}
Score walk(DirectiveMap::Conditional &C) {
Score Best;
bool MayTakeTrivial = true;
bool TookTrivial = false;
for (unsigned I = 0; I < C.Branches.size(); ++I) {
// Walk the branch to make its nested choices in any case.
Score BranchScore = walk(C.Branches[I].second);
// If we already took an #if 1, don't consider any other branches.
if (TookTrivial)
continue;
// Is this a trivial #if 0 or #if 1?
if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
if (!*TriviallyTaken)
continue; // Don't consider #if 0 even if it scores well.
if (MayTakeTrivial)
TookTrivial = true;
} else {
// After a nontrivial condition, #elif 1 isn't guaranteed taken.
MayTakeTrivial = false;
}
// Is this the best branch so far? (Including if it's #if 1).
if (TookTrivial || !C.Taken.hasValue() || BranchScore > Best) {
Best = BranchScore;
C.Taken = I;
}
}
return Best;
}
// Return true if the directive starts an always-taken conditional branch,
// false if the branch is never taken, and None otherwise.
llvm::Optional<bool> isTakenWhenReached(const DirectiveMap::Directive &Dir) {
switch (Dir.Kind) {
case clang::tok::pp_if:
case clang::tok::pp_elif:
break; // handled below
case clang::tok::pp_else:
return true;
default: // #ifdef etc
return llvm::None;
}
const auto &Tokens = Code.tokens(Dir.Tokens);
assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
const Token &Name = Tokens.front().nextNC();
const Token &Value = Name.nextNC();
// Does the condition consist of exactly one token?
if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
return llvm::None;
return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
.Cases("true", "1", true)
.Cases("false", "0", false)
.Default(llvm::None);
}
const TokenStream &Code;
};
} // namespace
void chooseConditionalBranches(DirectiveMap &Map, const TokenStream &Code) {
BranchChooser{Code}.choose(Map);
}
} // namespace pseudo
} // namespace clang

View File

@ -41,7 +41,7 @@ TOKEN-NEXT: r_brace 6:0 "}" flags=1
RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
PPS: code (5 tokens)
PPS-NEXT: #ifndef (3 tokens)
PPS-NEXT: #ifndef (3 tokens) TAKEN
PPS-NEXT: code (4 tokens)
PPS-NEXT: #else (2 tokens)
PPS-NEXT: code (3 tokens)

View File

@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
clang::LangOptions LangOpts; // FIXME: use real options.
auto Stream = clang::pseudo::lex(Text, LangOpts);
auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
clang::pseudo::chooseConditionalBranches(Structure, Stream);
if (PrintDirectiveMap)
llvm::outs() << Structure;

View File

@ -145,6 +145,162 @@ TEST(DirectiveMap, ParseBroken) {
EXPECT_EQ(0u, X.End.Tokens.size());
}
TEST(DirectiveMap, ChooseBranches) {
LangOptions Opts;
const std::string Cases[] = {
R"cpp(
// Branches with no alternatives are taken
#if COND // TAKEN
int x;
#endif
)cpp",
R"cpp(
// Empty branches are better than nothing
#if COND // TAKEN
#endif
)cpp",
R"cpp(
// Trivially false branches are not taken, even with no alternatives.
#if 0
int x;
#endif
)cpp",
R"cpp(
// Longer branches are preferred over shorter branches
#if COND // TAKEN
int x = 1;
#else
int x;
#endif
#if COND
int x;
#else // TAKEN
int x = 1;
#endif
)cpp",
R"cpp(
// Trivially true branches are taken if previous branches are trivial.
#if 1 // TAKEN
#else
int x = 1;
#endif
#if 0
int x = 1;
#elif 0
int x = 2;
#elif 1 // TAKEN
int x;
#endif
#if 0
int x = 1;
#elif FOO // TAKEN
int x = 2;
#elif 1
int x;
#endif
)cpp",
R"cpp(
// #else is a trivially true branch
#if 0
int x = 1;
#elif 0
int x = 2;
#else // TAKEN
int x;
#endif
)cpp",
R"cpp(
// Directives break ties, but nondirective text is more important.
#if FOO
#define A 1 2 3
#else // TAKEN
#define B 4 5 6
#define C 7 8 9
#endif
#if FOO // TAKEN
;
#define A 1 2 3
#else
#define B 4 5 6
#define C 7 8 9
#endif
)cpp",
R"cpp(
// Avoid #error directives.
#if FOO
int x = 42;
#error This branch is no good
#else // TAKEN
#endif
#if FOO
// All paths here lead to errors.
int x = 42;
#if 1 // TAKEN
#if COND // TAKEN
#error This branch is no good
#else
#error This one is no good either
#endif
#endif
#else // TAKEN
#endif
)cpp",
R"cpp(
// Populate taken branches recursively.
#if FOO // TAKEN
int x = 42;
#if BAR
;
#else // TAKEN
int y = 43;
#endif
#else
int x;
#if BAR // TAKEN
int y;
#else
;
#endif
#endif
)cpp",
};
for (const auto &Code : Cases) {
TokenStream S = cook(lex(Code, Opts), Opts);
std::function<void(const DirectiveMap &)> Verify =
[&](const DirectiveMap &M) {
for (const auto &C : M.Chunks) {
if (C.kind() != DirectiveMap::Chunk::K_Conditional)
continue;
const DirectiveMap::Conditional &Cond(C);
for (unsigned I = 0; I < Cond.Branches.size(); ++I) {
auto Directive = S.tokens(Cond.Branches[I].first.Tokens);
EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN")
<< "At line " << Directive.front().Line << " of: " << Code;
Verify(Cond.Branches[I].second);
}
}
};
DirectiveMap Map = DirectiveMap::parse(S);
chooseConditionalBranches(Map, S);
Verify(Map);
}
}
} // namespace
} // namespace pseudo
} // namespace clang