From 0360b9f1599b0b13f164d8170a619b19f9cb8bb4 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Wed, 18 May 2022 19:24:07 +0200 Subject: [PATCH] [pseudo] (trivial) bracket-matching Error-tolerant bracket matching enables our error-tolerant parsing strategies. The implementation here is *not* yet error tolerant: this patch sets up the APIs and plumbing, and describes the planned approach. Differential Revision: https://reviews.llvm.org/D125911 --- .../pseudo/benchmarks/Benchmark.cpp | 15 +- .../pseudo/include/clang-pseudo/Bracket.h | 41 +++++ .../pseudo/include/clang-pseudo/Token.h | 11 +- clang-tools-extra/pseudo/lib/Bracket.cpp | 155 ++++++++++++++++++ clang-tools-extra/pseudo/lib/CMakeLists.txt | 1 + clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 2 + .../pseudo/unittests/BracketTest.cpp | 117 +++++++++++++ .../pseudo/unittests/CMakeLists.txt | 2 + 8 files changed, 342 insertions(+), 2 deletions(-) create mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h create mode 100644 clang-tools-extra/pseudo/lib/Bracket.cpp create mode 100644 clang-tools-extra/pseudo/unittests/BracketTest.cpp diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp index fb028be0c7ab..b10ff3a175bd 100644 --- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp @@ -20,6 +20,7 @@ //===----------------------------------------------------------------------===// #include "benchmark/benchmark.h" +#include "clang-pseudo/Bracket.h" #include "clang-pseudo/DirectiveTree.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/GLR.h" @@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() { chooseConditionalBranches(DirectiveStructure, RawStream); TokenStream Cook = cook(DirectiveStructure.stripDirectives(RawStream), LangOpts); - return stripComments(Cook); + auto Stream = stripComments(Cook); + pairBrackets(Stream); + return Stream; } static void lex(benchmark::State &State) { 
@@ -101,6 +104,16 @@ static void lex(benchmark::State &State) { } BENCHMARK(lex); +static void pairBrackets(benchmark::State &State) { + clang::LangOptions LangOpts = genericLangOpts(); + auto Stream = clang::pseudo::lex(*SourceText, LangOpts); + for (auto _ : State) + pairBrackets(Stream); + State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) * + SourceText->size()); +} +BENCHMARK(pairBrackets); + static void preprocess(benchmark::State &State) { clang::LangOptions LangOpts = genericLangOpts(); TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts); diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h new file mode 100644 index 000000000000..268cfff1ab07 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h @@ -0,0 +1,41 @@ +//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Bracket structure (particularly braces) is key to isolating broken regions +// of code and preventing parsing from going "off the rails". +// +// For correct C++ code, brackets are well-nested and identifying pairs and +// therefore blocks is simple. In broken code, brackets are not properly nested. +// We cannot match them all and must choose which pairs to form. +// +// Rather than have the grammar-based parser make these choices, we pair +// brackets up-front based on textual features like indentation. +// This mirrors the way humans read code, and so is likely to produce the +// "correct" interpretation of broken code. +// +// This interpretation then guides the parse: a rule containing a bracket pair +// must match against paired bracket tokens.
+// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_BRACKET_H +#define CLANG_PSEUDO_BRACKET_H + +#include "clang-pseudo/Token.h" + +namespace clang { +namespace pseudo { + +/// Identifies bracket tokens in the stream which should be paired. +/// Sets Token::Pair accordingly. +void pairBrackets(TokenStream &); + +} // namespace pseudo +} // namespace clang + +#endif diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h index 1750f547abd1..b558891f0a86 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -88,11 +88,15 @@ struct Token { while (T->Kind == tok::comment); return *T; } + /// Returns the bracket paired with this one, if any. + const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; } /// The type of token as determined by clang's lexer. clang::tok::TokenKind Kind = clang::tok::unknown; + /// If this token is a paired bracket, the offset of the pair in the stream. + int32_t Pair = 0; }; -static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!"); +static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!"); llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); /// A half-open range of tokens within a stream. @@ -155,6 +159,11 @@ public: return tokens().slice(R.Begin, R.End - R.Begin); } + MutableArrayRef<Token> tokens() { + assert(isFinalized()); + return Tokens; + } + /// May return the end sentinel if the stream is empty.
const Token &front() const { assert(isFinalized()); diff --git a/clang-tools-extra/pseudo/lib/Bracket.cpp b/clang-tools-extra/pseudo/lib/Bracket.cpp new file mode 100644 index 000000000000..07836146ad8a --- /dev/null +++ b/clang-tools-extra/pseudo/lib/Bracket.cpp @@ -0,0 +1,155 @@ +//===--- Bracket.cpp - Analyze bracket structure --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The basic phases of our bracket matching are: +// +// 1) A simple "greedy" match looks for well-nested subsequences. +// +// We can't fully trust the results of this, consider: +// while (1) { // A +// if (true) { // B +// break; +// } // C +// Greedy matching will match B=C, when we should at least consider A=C. +// However for the correct parts of the file, the greedy match gives the +// right answer. It produces useful candidates for phase 2. +// +// simplePairBrackets handles this step. +// +// 2) Try to identify places where formatting indicates that the greedy match +// was correct. This is similar to how a human would scan a large file. +// +// For example: +// int foo() { // X +// // indented +// while (1) { +// // valid code +// } +// return bar(42); +// } // Y +// We can "verify" that X..Y looks like a braced block, and the greedy match +// tells us that substring is perfectly nested. +// We trust the pairings of those brackets and don't examine them further. +// However in the first example above, we do not trust B=C because the brace +// indentation is suspect. +// +// FIXME: implement this step. +// +// 3) Run full best-match optimization on remaining brackets. 
+// +// Conceptually, this considers all possible matchings and optimizes cost: +// - there is a cost for failing to match a bracket +// - there is a variable cost for matching two brackets. +// (For example if brace indentation doesn't match). +// +// In the first example we have three alternatives, and they are ranked: +// 1) A=C, skip B +// 2) B=C, skip A +// 3) skip A, skip B, skip C +// The cost for skipping a bracket is high, so option 3 is worst. +// B=C costs more than A=C, because the indentation doesn't match. +// +// It would be correct to run this step alone, but it would be too slow. +// The implementation is dynamic programming in N^3 space and N^2 time. +// Having earlier steps filter out most brackets is key to performance. +// +// FIXME: implement this step. +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Bracket.h" + +namespace clang { +namespace pseudo { +namespace { + +struct Bracket { + using Index = unsigned; + constexpr static Index None = -1; + + enum BracketKind : char { Paren, Brace, Square } Kind; + enum Direction : bool { Open, Close } Dir; + unsigned Line; + unsigned Indent; + Token::Index Tok; + Bracket::Index Pair = None; +}; + +// Find brackets in the stream and convert to Bracket struct. 
+std::vector<Bracket> findBrackets(const TokenStream &Stream) { + std::vector<Bracket> Brackets; + auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K, + Bracket::Direction D) { + Brackets.push_back( + {K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None}); + }; + for (const auto &Tok : Stream.tokens()) { + switch (Tok.Kind) { + case clang::tok::l_paren: + Add(Tok, Bracket::Paren, Bracket::Open); + break; + case clang::tok::r_paren: + Add(Tok, Bracket::Paren, Bracket::Close); + break; + case clang::tok::l_brace: + Add(Tok, Bracket::Brace, Bracket::Open); + break; + case clang::tok::r_brace: + Add(Tok, Bracket::Brace, Bracket::Close); + break; + case clang::tok::l_square: + Add(Tok, Bracket::Square, Bracket::Open); + break; + case clang::tok::r_square: + Add(Tok, Bracket::Square, Bracket::Close); + break; + default: + break; + } + } + return Brackets; +} + +// Write the bracket pairings from Brackets back to Tokens. +void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) { + for (const auto &B : Brackets) + Tokens.tokens()[B.Tok].Pair = + (B.Pair == Bracket::None) ? 0 : (int32_t)Brackets[B.Pair].Tok - B.Tok; +} + +// Find perfect pairings (ignoring whitespace) via greedy algorithm. +// This means two brackets are paired if they match and the brackets between +// them nest perfectly, with no skipped or crossed brackets. +void simplePairBrackets(MutableArrayRef<Bracket> Brackets) { + std::vector<unsigned> Stack; + for (unsigned I = 0; I < Brackets.size(); ++I) { + if (Brackets[I].Dir == Bracket::Open) { + Stack.push_back(I); + } else if (!Stack.empty() && + Brackets[Stack.back()].Kind == Brackets[I].Kind) { + Brackets[Stack.back()].Pair = I; + Brackets[I].Pair = Stack.back(); + Stack.pop_back(); + } else { + // Unpaired closer, no brackets on stack are part of a perfect sequence. + Stack.clear(); + } + } + // Any remaining brackets on the stack stay unpaired.
+} + +} // namespace + +void pairBrackets(TokenStream &Stream) { + auto Brackets = findBrackets(Stream); + simplePairBrackets(Brackets); + applyPairings(Brackets, Stream); +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt index b11d2dd12e28..6dc8ed5b5e7a 100644 --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_LINK_COMPONENTS Support) add_clang_library(clangPseudo + Bracket.cpp DirectiveTree.cpp Forest.cpp GLR.cpp diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp index 5a6956df1f70..1d3ab19b3c09 100644 --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "clang-pseudo/Bracket.h" #include "clang-pseudo/DirectiveTree.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Grammar.h" @@ -89,6 +90,7 @@ int main(int argc, char *argv[]) { llvm::outs() << DirectiveStructure; ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts)); + pairBrackets(*ParseableStream); } if (Grammar.getNumOccurrences()) { diff --git a/clang-tools-extra/pseudo/unittests/BracketTest.cpp b/clang-tools-extra/pseudo/unittests/BracketTest.cpp new file mode 100644 index 000000000000..1247ddbd49a1 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/BracketTest.cpp @@ -0,0 +1,117 @@ +//===--- BracketTest.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Bracket.h" +#include "clang-pseudo/Token.h" +#include "clang/Basic/LangOptions.h" +#include "llvm/Testing/Support/Annotations.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace pseudo { + +// Return a version of Code with each paired bracket marked with ^. +std::string decorate(llvm::StringRef Code, const TokenStream &Stream) { + std::string Result; + const char *Pos = Code.data(); + for (const Token &Tok : Stream.tokens()) { + if (Tok.Pair == 0) + continue; + const char *NewPos = Tok.text().begin(); + assert(NewPos >= Code.begin() && NewPos < Code.end()); + Result.append(Pos, NewPos - Pos); + Result.push_back('^'); + Pos = NewPos; + } + Result.append(Pos, Code.end() - Pos); + return Result; +} + +// Checks that the brackets matched in Stream are those annotated in MarkedCode. +void verifyMatchedSet(llvm::StringRef Code, llvm::StringRef MarkedCode, + const TokenStream &Stream) { + EXPECT_EQ(MarkedCode, decorate(Code, Stream)); +} + +// Checks that paired brackets within the stream nest properly. +void verifyNesting(const TokenStream &Stream) { + std::vector<const Token *> Stack; + for (const auto &Tok : Stream.tokens()) { + if (Tok.Pair > 0) + Stack.push_back(&Tok); + else if (Tok.Pair < 0) { + ASSERT_FALSE(Stack.empty()) << Tok; + ASSERT_EQ(Stack.back(), Tok.pair()) + << *Stack.back() << " != " << *Tok.pair() << " = pair of " << Tok; + Stack.pop_back(); + } + } + ASSERT_THAT(Stack, testing::IsEmpty()); +} + +// Checks that ( pairs with a ) on its right, etc.
+void verifyMatchKind(const TokenStream &Stream) { + for (const auto &Tok : Stream.tokens()) { + if (Tok.Pair == 0) + continue; + auto Want = [&]() -> std::pair<bool, tok::TokenKind> { + switch (Tok.Kind) { + case tok::l_paren: + return {true, tok::r_paren}; + case tok::r_paren: + return {false, tok::l_paren}; + case tok::l_brace: + return {true, tok::r_brace}; + case tok::r_brace: + return {false, tok::l_brace}; + case tok::l_square: + return {true, tok::r_square}; + case tok::r_square: + return {false, tok::l_square}; + default: + ADD_FAILURE() << "Paired non-bracket " << Tok; + return {false, tok::eof}; + } + }(); + EXPECT_EQ(Tok.Pair > 0, Want.first) << Tok; + EXPECT_EQ(Tok.pair()->Kind, Want.second) << Tok; + } +} + +// Verifies an expected bracket pairing like: +// ^( [ ^) +// The input is annotated code, with the brackets expected to be matched marked. +// +// The input doesn't specify which bracket matches with which, but we verify: +// - exactly the marked subset are paired +// - ( is paired to a later ), etc +// - brackets properly nest +// This uniquely determines the bracket structure, so we indirectly verify it. +// If particular tests should emphasize which brackets are paired, use comments.
+void verifyBrackets(llvm::StringRef MarkedCode) { + SCOPED_TRACE(MarkedCode); + llvm::Annotations A(MarkedCode); + std::string Code = A.code().str(); + LangOptions LangOpts; + auto Stream = lex(Code, LangOpts); + pairBrackets(Stream); + + verifyMatchedSet(Code, MarkedCode, Stream); + verifyNesting(Stream); + verifyMatchKind(Stream); +} + +TEST(Bracket, SimplePair) { + verifyBrackets("^{ ^[ ^( ^) ^( ^) ^] ^}"); + verifyBrackets(") ^{ ^[ ^] ^} ("); + verifyBrackets("{ [ ( ] }"); // FIXME +} + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt index aba8a1667489..73b13984d93e 100644 --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -1,9 +1,11 @@ set(LLVM_LINK_COMPONENTS Support + TestingSupport ) add_custom_target(ClangPseudoUnitTests) add_unittest(ClangPseudoUnitTests ClangPseudoTests + BracketTest.cpp DirectiveTreeTest.cpp ForestTest.cpp GLRTest.cpp