forked from OSchip/llvm-project
[pseudo] Add crude heuristics to choose taken preprocessor branches.
In files where different preprocessing paths are possible, our goal is to choose a preprocessed token sequence which we can parse that pins down as much of the grammatical structure as possible. This forms the "primary parse", and the not-taken branches get parsed later, and are constrained to be compatible with the primary parse. Concretely: int x = #ifdef // TAKEN 2 + 2 + 2 // determined during primary parse to be an expression #else 2 // constrained to be an expression during a secondary parse #endif ; Differential Revision: https://reviews.llvm.org/D121165
This commit is contained in:
parent
f4f1cf6c31
commit
af89e4792d
|
@ -78,6 +78,11 @@ struct DirectiveMap {
|
|||
std::vector<std::pair<Directive, DirectiveMap>> Branches;
|
||||
/// The directive terminating the conditional, should be #endif.
|
||||
Directive End;
|
||||
/// The index of the conditional branch we chose as active.
|
||||
/// None indicates no branch was taken (e.g. #if 0 ... #endif).
|
||||
/// The initial map from of `parse()` has no branches marked as taken.
|
||||
/// See `chooseConditionalBranches()`.
|
||||
llvm::Optional<unsigned> Taken;
|
||||
};
|
||||
|
||||
/// Some piece of the file. {One of Code, Directive, Conditional}.
|
||||
|
@ -87,7 +92,6 @@ struct DirectiveMap {
|
|||
/// Extract preprocessor structure by examining the raw tokens.
|
||||
static DirectiveMap parse(const TokenStream &);
|
||||
|
||||
// FIXME: add heuristically selection of conditional branches.
|
||||
// FIXME: allow deriving a preprocessed stream
|
||||
};
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
|
||||
|
@ -98,6 +102,24 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,
|
|||
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
|
||||
const DirectiveMap::Conditional &);
|
||||
|
||||
/// Selects a "taken" branch for each conditional directive in the file.
|
||||
///
|
||||
/// The choice is somewhat arbitrary, but aims to produce a useful parse:
|
||||
/// - idioms like `#if 0` are respected
|
||||
/// - we avoid paths that reach `#error`
|
||||
/// - we try to maximize the amount of code seen
|
||||
/// The choice may also be "no branch taken".
|
||||
///
|
||||
/// Choices are also made for conditionals themselves inside not-taken branches:
|
||||
/// #if 1 // taken!
|
||||
/// #else // not taken
|
||||
/// #if 1 // taken!
|
||||
/// #endif
|
||||
/// #endif
|
||||
///
|
||||
/// The choices are stored in Conditional::Taken nodes.
|
||||
void chooseConditionalBranches(DirectiveMap &, const TokenStream &Code);
|
||||
|
||||
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
|
||||
// Switch once we can use C++17.
|
||||
class DirectiveMap::Chunk {
|
||||
|
|
|
@ -74,6 +74,20 @@ struct Token {
|
|||
Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
|
||||
}
|
||||
|
||||
/// Returns the next token in the stream. this may not be a sentinel.
|
||||
const Token &next() const {
|
||||
assert(Kind != tok::eof);
|
||||
return *(this + 1);
|
||||
}
|
||||
/// Returns the next token in the stream, skipping over comments.
|
||||
const Token &nextNC() const {
|
||||
const Token *T = this;
|
||||
do
|
||||
T = &T->next();
|
||||
while (T->Kind == tok::comment);
|
||||
return *T;
|
||||
}
|
||||
|
||||
/// The type of token as determined by clang's lexer.
|
||||
clang::tok::TokenKind Kind = clang::tok::unknown;
|
||||
};
|
||||
|
|
|
@ -151,10 +151,11 @@ DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
|
|||
|
||||
static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
|
||||
static void dump(llvm::raw_ostream &OS,
|
||||
const DirectiveMap::Directive &Directive, unsigned Indent) {
|
||||
OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
|
||||
tok::getPPKeywordSpelling(Directive.Kind),
|
||||
Directive.Tokens.size());
|
||||
const DirectiveMap::Directive &Directive, unsigned Indent,
|
||||
bool Taken = false) {
|
||||
OS.indent(Indent) << llvm::formatv(
|
||||
"#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
|
||||
Directive.Tokens.size(), Taken ? " TAKEN" : "");
|
||||
}
|
||||
static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
|
||||
unsigned Indent) {
|
||||
|
@ -163,8 +164,9 @@ static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
|
|||
static void dump(llvm::raw_ostream &OS,
|
||||
const DirectiveMap::Conditional &Conditional,
|
||||
unsigned Indent) {
|
||||
for (const auto &Branch : Conditional.Branches) {
|
||||
dump(OS, Branch.first, Indent);
|
||||
for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
|
||||
const auto &Branch = Conditional.Branches[I];
|
||||
dump(OS, Branch.first, Indent, Conditional.Taken == I);
|
||||
dump(OS, Branch.second, Indent + 2);
|
||||
}
|
||||
dump(OS, Conditional.End, Indent);
|
||||
|
@ -203,5 +205,147 @@ OSTREAM_DUMP(DirectiveMap::Conditional)
|
|||
OSTREAM_DUMP(DirectiveMap::Code)
|
||||
#undef OSTREAM_DUMP
|
||||
|
||||
namespace {
|
||||
// Makes choices about conditional branches.
|
||||
//
|
||||
// Generally it tries to maximize the amount of useful code we see.
|
||||
//
|
||||
// Caveat: each conditional is evaluated independently. Consider this code:
|
||||
// #ifdef WINDOWS
|
||||
// bool isWindows = true;
|
||||
// #endif
|
||||
// #ifndef WINDOWS
|
||||
// bool isWindows = false;
|
||||
// #endif
|
||||
// We take both branches and define isWindows twice. We could track more state
|
||||
// in order to produce a consistent view, but this is complex.
|
||||
class BranchChooser {
|
||||
public:
|
||||
BranchChooser(const TokenStream &Code) : Code(Code) {}
|
||||
|
||||
void choose(DirectiveMap &M) { walk(M); }
|
||||
|
||||
private:
|
||||
// Describes code seen by making particular branch choices. Higher is better.
|
||||
struct Score {
|
||||
int Tokens = 0; // excluding comments and directives
|
||||
int Directives = 0;
|
||||
int Errors = 0; // #error directives
|
||||
|
||||
bool operator>(const Score &Other) const {
|
||||
// Seeing errors is bad, other things are good.
|
||||
return std::make_tuple(-Errors, Tokens, Directives) >
|
||||
std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
|
||||
}
|
||||
|
||||
Score &operator+=(const Score &Other) {
|
||||
Tokens += Other.Tokens;
|
||||
Directives += Other.Directives;
|
||||
Errors += Other.Errors;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
Score walk(DirectiveMap::Code &C) {
|
||||
Score S;
|
||||
for (const Token &T : Code.tokens(C.Tokens))
|
||||
if (T.Kind != tok::comment)
|
||||
++S.Tokens;
|
||||
return S;
|
||||
}
|
||||
|
||||
Score walk(DirectiveMap::Directive &D) {
|
||||
Score S;
|
||||
S.Directives = 1;
|
||||
S.Errors = D.Kind == tok::pp_error;
|
||||
return S;
|
||||
}
|
||||
|
||||
Score walk(DirectiveMap::Chunk &C) {
|
||||
switch (C.kind()) {
|
||||
case DirectiveMap::Chunk::K_Code:
|
||||
return walk((DirectiveMap::Code &)C);
|
||||
case DirectiveMap::Chunk::K_Directive:
|
||||
return walk((DirectiveMap::Directive &)C);
|
||||
case DirectiveMap::Chunk::K_Conditional:
|
||||
return walk((DirectiveMap::Conditional &)C);
|
||||
case DirectiveMap::Chunk::K_Empty:
|
||||
break;
|
||||
}
|
||||
llvm_unreachable("bad chunk kind");
|
||||
}
|
||||
|
||||
Score walk(DirectiveMap &M) {
|
||||
Score S;
|
||||
for (DirectiveMap::Chunk &C : M.Chunks)
|
||||
S += walk(C);
|
||||
return S;
|
||||
}
|
||||
|
||||
Score walk(DirectiveMap::Conditional &C) {
|
||||
Score Best;
|
||||
bool MayTakeTrivial = true;
|
||||
bool TookTrivial = false;
|
||||
|
||||
for (unsigned I = 0; I < C.Branches.size(); ++I) {
|
||||
// Walk the branch to make its nested choices in any case.
|
||||
Score BranchScore = walk(C.Branches[I].second);
|
||||
// If we already took an #if 1, don't consider any other branches.
|
||||
if (TookTrivial)
|
||||
continue;
|
||||
// Is this a trivial #if 0 or #if 1?
|
||||
if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
|
||||
if (!*TriviallyTaken)
|
||||
continue; // Don't consider #if 0 even if it scores well.
|
||||
if (MayTakeTrivial)
|
||||
TookTrivial = true;
|
||||
} else {
|
||||
// After a nontrivial condition, #elif 1 isn't guaranteed taken.
|
||||
MayTakeTrivial = false;
|
||||
}
|
||||
// Is this the best branch so far? (Including if it's #if 1).
|
||||
if (TookTrivial || !C.Taken.hasValue() || BranchScore > Best) {
|
||||
Best = BranchScore;
|
||||
C.Taken = I;
|
||||
}
|
||||
}
|
||||
return Best;
|
||||
}
|
||||
|
||||
// Return true if the directive starts an always-taken conditional branch,
|
||||
// false if the branch is never taken, and None otherwise.
|
||||
llvm::Optional<bool> isTakenWhenReached(const DirectiveMap::Directive &Dir) {
|
||||
switch (Dir.Kind) {
|
||||
case clang::tok::pp_if:
|
||||
case clang::tok::pp_elif:
|
||||
break; // handled below
|
||||
case clang::tok::pp_else:
|
||||
return true;
|
||||
default: // #ifdef etc
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
const auto &Tokens = Code.tokens(Dir.Tokens);
|
||||
assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
|
||||
const Token &Name = Tokens.front().nextNC();
|
||||
const Token &Value = Name.nextNC();
|
||||
// Does the condition consist of exactly one token?
|
||||
if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
|
||||
return llvm::None;
|
||||
return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
|
||||
.Cases("true", "1", true)
|
||||
.Cases("false", "0", false)
|
||||
.Default(llvm::None);
|
||||
}
|
||||
|
||||
const TokenStream &Code;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
void chooseConditionalBranches(DirectiveMap &Map, const TokenStream &Code) {
|
||||
BranchChooser{Code}.choose(Map);
|
||||
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
||||
|
|
|
@ -41,7 +41,7 @@ TOKEN-NEXT: r_brace 6:0 "}" flags=1
|
|||
|
||||
RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
|
||||
PPS: code (5 tokens)
|
||||
PPS-NEXT: #ifndef (3 tokens)
|
||||
PPS-NEXT: #ifndef (3 tokens) TAKEN
|
||||
PPS-NEXT: code (4 tokens)
|
||||
PPS-NEXT: #else (2 tokens)
|
||||
PPS-NEXT: code (3 tokens)
|
||||
|
|
|
@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
|
|||
clang::LangOptions LangOpts; // FIXME: use real options.
|
||||
auto Stream = clang::pseudo::lex(Text, LangOpts);
|
||||
auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
|
||||
clang::pseudo::chooseConditionalBranches(Structure, Stream);
|
||||
|
||||
if (PrintDirectiveMap)
|
||||
llvm::outs() << Structure;
|
||||
|
|
|
@ -145,6 +145,162 @@ TEST(DirectiveMap, ParseBroken) {
|
|||
EXPECT_EQ(0u, X.End.Tokens.size());
|
||||
}
|
||||
|
||||
TEST(DirectiveMap, ChooseBranches) {
|
||||
LangOptions Opts;
|
||||
const std::string Cases[] = {
|
||||
R"cpp(
|
||||
// Branches with no alternatives are taken
|
||||
#if COND // TAKEN
|
||||
int x;
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Empty branches are better than nothing
|
||||
#if COND // TAKEN
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Trivially false branches are not taken, even with no alternatives.
|
||||
#if 0
|
||||
int x;
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Longer branches are preferred over shorter branches
|
||||
#if COND // TAKEN
|
||||
int x = 1;
|
||||
#else
|
||||
int x;
|
||||
#endif
|
||||
|
||||
#if COND
|
||||
int x;
|
||||
#else // TAKEN
|
||||
int x = 1;
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Trivially true branches are taken if previous branches are trivial.
|
||||
#if 1 // TAKEN
|
||||
#else
|
||||
int x = 1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
int x = 1;
|
||||
#elif 0
|
||||
int x = 2;
|
||||
#elif 1 // TAKEN
|
||||
int x;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
int x = 1;
|
||||
#elif FOO // TAKEN
|
||||
int x = 2;
|
||||
#elif 1
|
||||
int x;
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// #else is a trivially true branch
|
||||
#if 0
|
||||
int x = 1;
|
||||
#elif 0
|
||||
int x = 2;
|
||||
#else // TAKEN
|
||||
int x;
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Directives break ties, but nondirective text is more important.
|
||||
#if FOO
|
||||
#define A 1 2 3
|
||||
#else // TAKEN
|
||||
#define B 4 5 6
|
||||
#define C 7 8 9
|
||||
#endif
|
||||
|
||||
#if FOO // TAKEN
|
||||
;
|
||||
#define A 1 2 3
|
||||
#else
|
||||
#define B 4 5 6
|
||||
#define C 7 8 9
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Avoid #error directives.
|
||||
#if FOO
|
||||
int x = 42;
|
||||
#error This branch is no good
|
||||
#else // TAKEN
|
||||
#endif
|
||||
|
||||
#if FOO
|
||||
// All paths here lead to errors.
|
||||
int x = 42;
|
||||
#if 1 // TAKEN
|
||||
#if COND // TAKEN
|
||||
#error This branch is no good
|
||||
#else
|
||||
#error This one is no good either
|
||||
#endif
|
||||
#endif
|
||||
#else // TAKEN
|
||||
#endif
|
||||
)cpp",
|
||||
|
||||
R"cpp(
|
||||
// Populate taken branches recursively.
|
||||
#if FOO // TAKEN
|
||||
int x = 42;
|
||||
#if BAR
|
||||
;
|
||||
#else // TAKEN
|
||||
int y = 43;
|
||||
#endif
|
||||
#else
|
||||
int x;
|
||||
#if BAR // TAKEN
|
||||
int y;
|
||||
#else
|
||||
;
|
||||
#endif
|
||||
#endif
|
||||
)cpp",
|
||||
};
|
||||
for (const auto &Code : Cases) {
|
||||
TokenStream S = cook(lex(Code, Opts), Opts);
|
||||
|
||||
std::function<void(const DirectiveMap &)> Verify =
|
||||
[&](const DirectiveMap &M) {
|
||||
for (const auto &C : M.Chunks) {
|
||||
if (C.kind() != DirectiveMap::Chunk::K_Conditional)
|
||||
continue;
|
||||
const DirectiveMap::Conditional &Cond(C);
|
||||
for (unsigned I = 0; I < Cond.Branches.size(); ++I) {
|
||||
auto Directive = S.tokens(Cond.Branches[I].first.Tokens);
|
||||
EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN")
|
||||
<< "At line " << Directive.front().Line << " of: " << Code;
|
||||
Verify(Cond.Branches[I].second);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
DirectiveMap Map = DirectiveMap::parse(S);
|
||||
chooseConditionalBranches(Map, S);
|
||||
Verify(Map);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
||||
|
|
Loading…
Reference in New Issue