From af89e4792d23779969c70284dcf6cfafa411637c Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 7 Mar 2022 23:33:40 +0100 Subject: [PATCH] [pseudo] Add crude heuristics to choose taken preprocessor branches. In files where different preprocessing paths are possible, our goal is to choose a preprocessed token sequence which we can parse that pins down as much of the grammatical structure as possible. This forms the "primary parse", and the not-taken branches get parsed later, and are constrained to be compatible with the primary parse. Concretely: int x = #ifdef // TAKEN 2 + 2 + 2 // determined during primary parse to be an expression #else 2 // constrained to be an expression during a secondary parse #endif ; Differential Revision: https://reviews.llvm.org/D121165 --- .../include/clang-pseudo/DirectiveMap.h | 24 ++- .../pseudo/include/clang-pseudo/Token.h | 14 ++ clang-tools-extra/pseudo/lib/DirectiveMap.cpp | 156 +++++++++++++++++- clang-tools-extra/pseudo/test/lex.c | 2 +- clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 1 + .../pseudo/unittests/DirectiveMapTest.cpp | 156 ++++++++++++++++++ 6 files changed, 345 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h index 14c752d29040..67e815e67efd 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveMap.h @@ -78,6 +78,11 @@ struct DirectiveMap { std::vector> Branches; /// The directive terminating the conditional, should be #endif. Directive End; + /// The index of the conditional branch we chose as active. + /// None indicates no branch was taken (e.g. #if 0 ... #endif). + /// The initial map from of `parse()` has no branches marked as taken. + /// See `chooseConditionalBranches()`. + llvm::Optional Taken; }; /// Some piece of the file. {One of Code, Directive, Conditional}. @@ -87,7 +92,6 @@ struct DirectiveMap { /// Extract preprocessor structure by examining the raw tokens. static DirectiveMap parse(const TokenStream &); - // FIXME: add heuristically selection of conditional branches. // FIXME: allow deriving a preprocessed stream }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &); @@ -98,6 +102,24 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &, llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Conditional &); +/// Selects a "taken" branch for each conditional directive in the file. +/// +/// The choice is somewhat arbitrary, but aims to produce a useful parse: +/// - idioms like `#if 0` are respected +/// - we avoid paths that reach `#error` +/// - we try to maximize the amount of code seen +/// The choice may also be "no branch taken". +/// +/// Choices are also made for conditionals themselves inside not-taken branches: +/// #if 1 // taken! +/// #else // not taken +/// #if 1 // taken! +/// #endif +/// #endif +/// +/// The choices are stored in Conditional::Taken nodes. +void chooseConditionalBranches(DirectiveMap &, const TokenStream &Code); + // FIXME: This approximates std::variant. // Switch once we can use C++17. class DirectiveMap::Chunk { diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h index 4563477b2c4f..2bbd598736e6 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -74,6 +74,20 @@ struct Token { Flags |= uint8_t{static_cast>(Mask)}; } + /// Returns the next token in the stream. this may not be a sentinel. + const Token &next() const { + assert(Kind != tok::eof); + return *(this + 1); + } + /// Returns the next token in the stream, skipping over comments. + const Token &nextNC() const { + const Token *T = this; + do + T = &T->next(); + while (T->Kind == tok::comment); + return *T; + } + /// The type of token as determined by clang's lexer. clang::tok::TokenKind Kind = clang::tok::unknown; }; diff --git a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp index 6e6670881895..f25231c9b7f2 100644 --- a/clang-tools-extra/pseudo/lib/DirectiveMap.cpp +++ b/clang-tools-extra/pseudo/lib/DirectiveMap.cpp @@ -151,10 +151,11 @@ DirectiveMap DirectiveMap::parse(const TokenStream &Code) { static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent); static void dump(llvm::raw_ostream &OS, - const DirectiveMap::Directive &Directive, unsigned Indent) { - OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n", - tok::getPPKeywordSpelling(Directive.Kind), - Directive.Tokens.size()); + const DirectiveMap::Directive &Directive, unsigned Indent, + bool Taken = false) { + OS.indent(Indent) << llvm::formatv( + "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind), + Directive.Tokens.size(), Taken ? " TAKEN" : ""); } static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code, unsigned Indent) { @@ -163,8 +164,9 @@ static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code, static void dump(llvm::raw_ostream &OS, const DirectiveMap::Conditional &Conditional, unsigned Indent) { - for (const auto &Branch : Conditional.Branches) { - dump(OS, Branch.first, Indent); + for (unsigned I = 0; I < Conditional.Branches.size(); ++I) { + const auto &Branch = Conditional.Branches[I]; + dump(OS, Branch.first, Indent, Conditional.Taken == I); dump(OS, Branch.second, Indent + 2); } dump(OS, Conditional.End, Indent); @@ -203,5 +205,147 @@ OSTREAM_DUMP(DirectiveMap::Conditional) OSTREAM_DUMP(DirectiveMap::Code) #undef OSTREAM_DUMP +namespace { +// Makes choices about conditional branches. +// +// Generally it tries to maximize the amount of useful code we see. +// +// Caveat: each conditional is evaluated independently. Consider this code: +// #ifdef WINDOWS +// bool isWindows = true; +// #endif +// #ifndef WINDOWS +// bool isWindows = false; +// #endif +// We take both branches and define isWindows twice. We could track more state +// in order to produce a consistent view, but this is complex. +class BranchChooser { +public: + BranchChooser(const TokenStream &Code) : Code(Code) {} + + void choose(DirectiveMap &M) { walk(M); } + +private: + // Describes code seen by making particular branch choices. Higher is better. + struct Score { + int Tokens = 0; // excluding comments and directives + int Directives = 0; + int Errors = 0; // #error directives + + bool operator>(const Score &Other) const { + // Seeing errors is bad, other things are good. + return std::make_tuple(-Errors, Tokens, Directives) > + std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives); + } + + Score &operator+=(const Score &Other) { + Tokens += Other.Tokens; + Directives += Other.Directives; + Errors += Other.Errors; + return *this; + } + }; + + Score walk(DirectiveMap::Code &C) { + Score S; + for (const Token &T : Code.tokens(C.Tokens)) + if (T.Kind != tok::comment) + ++S.Tokens; + return S; + } + + Score walk(DirectiveMap::Directive &D) { + Score S; + S.Directives = 1; + S.Errors = D.Kind == tok::pp_error; + return S; + } + + Score walk(DirectiveMap::Chunk &C) { + switch (C.kind()) { + case DirectiveMap::Chunk::K_Code: + return walk((DirectiveMap::Code &)C); + case DirectiveMap::Chunk::K_Directive: + return walk((DirectiveMap::Directive &)C); + case DirectiveMap::Chunk::K_Conditional: + return walk((DirectiveMap::Conditional &)C); + case DirectiveMap::Chunk::K_Empty: + break; + } + llvm_unreachable("bad chunk kind"); + } + + Score walk(DirectiveMap &M) { + Score S; + for (DirectiveMap::Chunk &C : M.Chunks) + S += walk(C); + return S; + } + + Score walk(DirectiveMap::Conditional &C) { + Score Best; + bool MayTakeTrivial = true; + bool TookTrivial = false; + + for (unsigned I = 0; I < C.Branches.size(); ++I) { + // Walk the branch to make its nested choices in any case. + Score BranchScore = walk(C.Branches[I].second); + // If we already took an #if 1, don't consider any other branches. + if (TookTrivial) + continue; + // Is this a trivial #if 0 or #if 1? + if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) { + if (!*TriviallyTaken) + continue; // Don't consider #if 0 even if it scores well. + if (MayTakeTrivial) + TookTrivial = true; + } else { + // After a nontrivial condition, #elif 1 isn't guaranteed taken. + MayTakeTrivial = false; + } + // Is this the best branch so far? (Including if it's #if 1). + if (TookTrivial || !C.Taken.hasValue() || BranchScore > Best) { + Best = BranchScore; + C.Taken = I; + } + } + return Best; + } + + // Return true if the directive starts an always-taken conditional branch, + // false if the branch is never taken, and None otherwise. + llvm::Optional isTakenWhenReached(const DirectiveMap::Directive &Dir) { + switch (Dir.Kind) { + case clang::tok::pp_if: + case clang::tok::pp_elif: + break; // handled below + case clang::tok::pp_else: + return true; + default: // #ifdef etc + return llvm::None; + } + + const auto &Tokens = Code.tokens(Dir.Tokens); + assert(!Tokens.empty() && Tokens.front().Kind == tok::hash); + const Token &Name = Tokens.front().nextNC(); + const Token &Value = Name.nextNC(); + // Does the condition consist of exactly one token? + if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end()) + return llvm::None; + return llvm::StringSwitch>(Value.text()) + .Cases("true", "1", true) + .Cases("false", "0", false) + .Default(llvm::None); + } + + const TokenStream &Code; +}; + +} // namespace + +void chooseConditionalBranches(DirectiveMap &Map, const TokenStream &Code) { + BranchChooser{Code}.choose(Map); +} + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c index 39b114e8d961..0bba84c91f40 100644 --- a/clang-tools-extra/pseudo/test/lex.c +++ b/clang-tools-extra/pseudo/test/lex.c @@ -41,7 +41,7 @@ TOKEN-NEXT: r_brace 6:0 "}" flags=1 RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace PPS: code (5 tokens) -PPS-NEXT: #ifndef (3 tokens) +PPS-NEXT: #ifndef (3 tokens) TAKEN PPS-NEXT: code (4 tokens) PPS-NEXT: #else (2 tokens) PPS-NEXT: code (3 tokens) diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp index e893ed2fb03e..fd5eb60d5eb3 100644 --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -75,6 +75,7 @@ int main(int argc, char *argv[]) { clang::LangOptions LangOpts; // FIXME: use real options. auto Stream = clang::pseudo::lex(Text, LangOpts); auto Structure = clang::pseudo::DirectiveMap::parse(Stream); + clang::pseudo::chooseConditionalBranches(Structure, Stream); if (PrintDirectiveMap) llvm::outs() << Structure; diff --git a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp index 1feae342655b..010451fd7dc3 100644 --- a/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp +++ b/clang-tools-extra/pseudo/unittests/DirectiveMapTest.cpp @@ -145,6 +145,162 @@ TEST(DirectiveMap, ParseBroken) { EXPECT_EQ(0u, X.End.Tokens.size()); } +TEST(DirectiveMap, ChooseBranches) { + LangOptions Opts; + const std::string Cases[] = { + R"cpp( + // Branches with no alternatives are taken + #if COND // TAKEN + int x; + #endif + )cpp", + + R"cpp( + // Empty branches are better than nothing + #if COND // TAKEN + #endif + )cpp", + + R"cpp( + // Trivially false branches are not taken, even with no alternatives. + #if 0 + int x; + #endif + )cpp", + + R"cpp( + // Longer branches are preferred over shorter branches + #if COND // TAKEN + int x = 1; + #else + int x; + #endif + + #if COND + int x; + #else // TAKEN + int x = 1; + #endif + )cpp", + + R"cpp( + // Trivially true branches are taken if previous branches are trivial. + #if 1 // TAKEN + #else + int x = 1; + #endif + + #if 0 + int x = 1; + #elif 0 + int x = 2; + #elif 1 // TAKEN + int x; + #endif + + #if 0 + int x = 1; + #elif FOO // TAKEN + int x = 2; + #elif 1 + int x; + #endif + )cpp", + + R"cpp( + // #else is a trivially true branch + #if 0 + int x = 1; + #elif 0 + int x = 2; + #else // TAKEN + int x; + #endif + )cpp", + + R"cpp( + // Directives break ties, but nondirective text is more important. + #if FOO + #define A 1 2 3 + #else // TAKEN + #define B 4 5 6 + #define C 7 8 9 + #endif + + #if FOO // TAKEN + ; + #define A 1 2 3 + #else + #define B 4 5 6 + #define C 7 8 9 + #endif + )cpp", + + R"cpp( + // Avoid #error directives. + #if FOO + int x = 42; + #error This branch is no good + #else // TAKEN + #endif + + #if FOO + // All paths here lead to errors. + int x = 42; + #if 1 // TAKEN + #if COND // TAKEN + #error This branch is no good + #else + #error This one is no good either + #endif + #endif + #else // TAKEN + #endif + )cpp", + + R"cpp( + // Populate taken branches recursively. + #if FOO // TAKEN + int x = 42; + #if BAR + ; + #else // TAKEN + int y = 43; + #endif + #else + int x; + #if BAR // TAKEN + int y; + #else + ; + #endif + #endif + )cpp", + }; + for (const auto &Code : Cases) { + TokenStream S = cook(lex(Code, Opts), Opts); + + std::function Verify = + [&](const DirectiveMap &M) { + for (const auto &C : M.Chunks) { + if (C.kind() != DirectiveMap::Chunk::K_Conditional) + continue; + const DirectiveMap::Conditional &Cond(C); + for (unsigned I = 0; I < Cond.Branches.size(); ++I) { + auto Directive = S.tokens(Cond.Branches[I].first.Tokens); + EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN") + << "At line " << Directive.front().Line << " of: " << Code; + Verify(Cond.Branches[I].second); + } + } + }; + + DirectiveMap Map = DirectiveMap::parse(S); + chooseConditionalBranches(Map, S); + Verify(Map); + } +} + } // namespace } // namespace pseudo } // namespace clang