forked from OSchip/llvm-project
[pseudo] (trivial) bracket-matching
Error-tolerant bracket matching enables our error-tolerant parsing strategies. The implementation here is *not* yet error tolerant: this patch sets up the APIs and plumbing, and describes the planned approach. Differential Revision: https://reviews.llvm.org/D125911
This commit is contained in:
parent
f37101983f
commit
0360b9f159
|
@ -20,6 +20,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "clang-pseudo/Bracket.h"
|
||||
#include "clang-pseudo/DirectiveTree.h"
|
||||
#include "clang-pseudo/Forest.h"
|
||||
#include "clang-pseudo/GLR.h"
|
||||
|
@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() {
|
|||
chooseConditionalBranches(DirectiveStructure, RawStream);
|
||||
TokenStream Cook =
|
||||
cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
|
||||
return stripComments(Cook);
|
||||
auto Stream = stripComments(Cook);
|
||||
pairBrackets(Stream);
|
||||
return Stream;
|
||||
}
|
||||
|
||||
static void lex(benchmark::State &State) {
|
||||
|
@ -101,6 +104,16 @@ static void lex(benchmark::State &State) {
|
|||
}
|
||||
BENCHMARK(lex);
|
||||
|
||||
// Benchmarks bracket pairing on the raw lexed token stream.
// Lexing is done once, outside the timed loop; throughput is reported in bytes
// of source text per iteration.
static void pairBrackets(benchmark::State &State) {
  clang::LangOptions Opts = genericLangOpts();
  TokenStream Lexed = clang::pseudo::lex(*SourceText, Opts);
  for (auto _ : State) {
    pairBrackets(Lexed);
  }
  const uint64_t Iterations = static_cast<uint64_t>(State.iterations());
  State.SetBytesProcessed(Iterations * SourceText->size());
}
|
||||
BENCHMARK(pairBrackets);
|
||||
|
||||
static void preprocess(benchmark::State &State) {
|
||||
clang::LangOptions LangOpts = genericLangOpts();
|
||||
TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Bracket structure (particularly braces) is key to isolating broken regions
|
||||
// of code and preventing parsing from going "off the rails".
|
||||
//
|
||||
// For correct C++ code, brackets are well-nested and identifying pairs and
|
||||
// therefore blocks is simple. In broken code, brackets are not properly nested.
|
||||
// We cannot match them all and must choose which pairs to form.
|
||||
//
|
||||
// Rather than have the grammar-based parser make these choices, we pair
|
||||
// brackets up-front based on textual features like indentation.
|
||||
// This mirrors the way humans read code, and so is likely to produce the
|
||||
// "correct" interpretation of broken code.
|
||||
//
|
||||
// This interpretation then guides the parse: a rule containing a bracket pair
|
||||
// must match against paired bracket tokens.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef CLANG_PSEUDO_BRACKET_H
|
||||
#define CLANG_PSEUDO_BRACKET_H
|
||||
|
||||
#include "clang-pseudo/Token.h"
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
|
||||
/// Identifies bracket tokens in the stream which should be paired.
|
||||
/// Sets Token::Pair accordingly.
|
||||
void pairBrackets(TokenStream &);
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
||||
|
||||
#endif
|
|
@ -88,11 +88,15 @@ struct Token {
|
|||
while (T->Kind == tok::comment);
|
||||
return *T;
|
||||
}
|
||||
/// Returns the bracket paired with this one, if any.
|
||||
const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
|
||||
|
||||
/// The type of token as determined by clang's lexer.
|
||||
clang::tok::TokenKind Kind = clang::tok::unknown;
|
||||
/// If this token is a paired bracket, the offset of the pair in the stream.
|
||||
int32_t Pair = 0;
|
||||
};
|
||||
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
|
||||
static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!");
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
|
||||
|
||||
/// A half-open range of tokens within a stream.
|
||||
|
@ -155,6 +159,11 @@ public:
|
|||
return tokens().slice(R.Begin, R.End - R.Begin);
|
||||
}
|
||||
|
||||
/// Mutable view of the stream's tokens.
/// The stream must already be finalized (checked by assertion).
MutableArrayRef<Token> tokens() {
  assert(isFinalized());
  return Tokens;
}
|
||||
|
||||
/// May return the end sentinel if the stream is empty.
|
||||
const Token &front() const {
|
||||
assert(isFinalized());
|
||||
|
|
|
@ -0,0 +1,155 @@
|
|||
//===--- Bracket.cpp - Analyze bracket structure --------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The basic phases of our bracket matching are:
|
||||
//
|
||||
// 1) A simple "greedy" match looks for well-nested subsequences.
|
||||
//
|
||||
// We can't fully trust the results of this, consider:
|
||||
// while (1) { // A
|
||||
// if (true) { // B
|
||||
// break;
|
||||
// } // C
|
||||
// Greedy matching will match B=C, when we should at least consider A=C.
|
||||
// However for the correct parts of the file, the greedy match gives the
|
||||
// right answer. It produces useful candidates for phase 2.
|
||||
//
|
||||
// simplePairBrackets handles this step.
|
||||
//
|
||||
// 2) Try to identify places where formatting indicates that the greedy match
|
||||
// was correct. This is similar to how a human would scan a large file.
|
||||
//
|
||||
// For example:
|
||||
// int foo() { // X
|
||||
// // indented
|
||||
// while (1) {
|
||||
// // valid code
|
||||
// }
|
||||
// return bar(42);
|
||||
// } // Y
|
||||
// We can "verify" that X..Y looks like a braced block, and the greedy match
|
||||
// tells us that substring is perfectly nested.
|
||||
// We trust the pairings of those brackets and don't examine them further.
|
||||
// However in the first example above, we do not trust B=C because the brace
|
||||
// indentation is suspect.
|
||||
//
|
||||
// FIXME: implement this step.
|
||||
//
|
||||
// 3) Run full best-match optimization on remaining brackets.
|
||||
//
|
||||
// Conceptually, this considers all possible matchings and optimizes cost:
|
||||
// - there is a cost for failing to match a bracket
|
||||
// - there is a variable cost for matching two brackets.
|
||||
// (For example if brace indentation doesn't match).
|
||||
//
|
||||
// In the first example we have three alternatives, and they are ranked:
|
||||
// 1) A=C, skip B
|
||||
// 2) B=C, skip A
|
||||
// 3) skip A, skip B, skip C
|
||||
// The cost for skipping a bracket is high, so option 3 is worst.
|
||||
// B=C costs more than A=C, because the indentation doesn't match.
|
||||
//
|
||||
// It would be correct to run this step alone, but it would be too slow.
|
||||
// The implementation is dynamic programming in N^2 space and N^3 time.
|
||||
// Having earlier steps filter out most brackets is key to performance.
|
||||
//
|
||||
// FIXME: implement this step.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/Bracket.h"
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
struct Bracket {
|
||||
using Index = unsigned;
|
||||
constexpr static Index None = -1;
|
||||
|
||||
enum BracketKind : char { Paren, Brace, Square } Kind;
|
||||
enum Direction : bool { Open, Close } Dir;
|
||||
unsigned Line;
|
||||
unsigned Indent;
|
||||
Token::Index Tok;
|
||||
Bracket::Index Pair = None;
|
||||
};
|
||||
|
||||
// Find brackets in the stream and convert to Bracket struct.
|
||||
std::vector<Bracket> findBrackets(const TokenStream &Stream) {
|
||||
std::vector<Bracket> Brackets;
|
||||
auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
|
||||
Bracket::Direction D) {
|
||||
Brackets.push_back(
|
||||
{K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
|
||||
};
|
||||
for (const auto &Tok : Stream.tokens()) {
|
||||
switch (Tok.Kind) {
|
||||
case clang::tok::l_paren:
|
||||
Add(Tok, Bracket::Paren, Bracket::Open);
|
||||
break;
|
||||
case clang::tok::r_paren:
|
||||
Add(Tok, Bracket::Paren, Bracket::Close);
|
||||
break;
|
||||
case clang::tok::l_brace:
|
||||
Add(Tok, Bracket::Brace, Bracket::Open);
|
||||
break;
|
||||
case clang::tok::r_brace:
|
||||
Add(Tok, Bracket::Brace, Bracket::Close);
|
||||
break;
|
||||
case clang::tok::l_square:
|
||||
Add(Tok, Bracket::Square, Bracket::Open);
|
||||
break;
|
||||
case clang::tok::r_square:
|
||||
Add(Tok, Bracket::Square, Bracket::Close);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Brackets;
|
||||
}
|
||||
|
||||
// Write the bracket pairings from Brackets back to Tokens.
|
||||
void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) {
|
||||
for (const auto &B : Brackets)
|
||||
Tokens.tokens()[B.Tok].Pair =
|
||||
(B.Pair == Bracket::None) ? 0 : (int32_t)Brackets[B.Pair].Tok - B.Tok;
|
||||
}
|
||||
|
||||
// Find perfect pairings (ignoring whitespace) via greedy algorithm.
|
||||
// This means two brackets are paired if they match and the brackets between
|
||||
// them nest perfectly, with no skipped or crossed brackets.
|
||||
void simplePairBrackets(MutableArrayRef<Bracket> Brackets) {
|
||||
std::vector<unsigned> Stack;
|
||||
for (unsigned I = 0; I < Brackets.size(); ++I) {
|
||||
if (Brackets[I].Dir == Bracket::Open) {
|
||||
Stack.push_back(I);
|
||||
} else if (!Stack.empty() &&
|
||||
Brackets[Stack.back()].Kind == Brackets[I].Kind) {
|
||||
Brackets[Stack.back()].Pair = I;
|
||||
Brackets[I].Pair = Stack.back();
|
||||
Stack.pop_back();
|
||||
} else {
|
||||
// Unpaired closer, no brackets on stack are part of a perfect sequence.
|
||||
Stack.clear();
|
||||
}
|
||||
}
|
||||
// Any remaining brackets on the stack stay unpaired.
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Entry point: collect the bracket tokens, pair them with the greedy matcher,
// then record the result on the stream's tokens.
void pairBrackets(TokenStream &Stream) {
  std::vector<Bracket> Found = findBrackets(Stream);
  simplePairBrackets(Found);
  applyPairings(Found, Stream);
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
|
@ -1,6 +1,7 @@
|
|||
set(LLVM_LINK_COMPONENTS Support)
|
||||
|
||||
add_clang_library(clangPseudo
|
||||
Bracket.cpp
|
||||
DirectiveTree.cpp
|
||||
Forest.cpp
|
||||
GLR.cpp
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/Bracket.h"
|
||||
#include "clang-pseudo/DirectiveTree.h"
|
||||
#include "clang-pseudo/GLR.h"
|
||||
#include "clang-pseudo/Grammar.h"
|
||||
|
@ -89,6 +90,7 @@ int main(int argc, char *argv[]) {
|
|||
llvm::outs() << DirectiveStructure;
|
||||
|
||||
ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
|
||||
pairBrackets(*ParseableStream);
|
||||
}
|
||||
|
||||
if (Grammar.getNumOccurrences()) {
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
//===--- BracketTest.cpp -------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/Bracket.h"
|
||||
#include "clang-pseudo/Token.h"
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
#include "llvm/Testing/Support/Annotations.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
|
||||
// Return a version of Code with each paired bracket marked with ^.
|
||||
std::string decorate(llvm::StringRef Code, const TokenStream &Stream) {
|
||||
std::string Result;
|
||||
const char *Pos = Code.data();
|
||||
for (const Token &Tok : Stream.tokens()) {
|
||||
if (Tok.Pair == 0)
|
||||
continue;
|
||||
const char *NewPos = Tok.text().begin();
|
||||
assert(NewPos >= Code.begin() && NewPos < Code.end());
|
||||
Result.append(Pos, NewPos - Pos);
|
||||
Result.push_back('^');
|
||||
Pos = NewPos;
|
||||
}
|
||||
Result.append(Pos, Code.end() - Pos);
|
||||
return Result;
|
||||
}
|
||||
|
||||
// Checks that the brackets matched in Stream are those annotated in MarkedCode.
|
||||
void verifyMatchedSet(llvm::StringRef Code, llvm::StringRef MarkedCode,
|
||||
const TokenStream &Stream) {
|
||||
EXPECT_EQ(MarkedCode, decorate(Code, Stream));
|
||||
}
|
||||
|
||||
// Checks that paired brackets within the stream nest properly.
|
||||
void verifyNesting(const TokenStream &Stream) {
|
||||
std::vector<const Token *> Stack;
|
||||
for (const auto &Tok : Stream.tokens()) {
|
||||
if (Tok.Pair > 0)
|
||||
Stack.push_back(&Tok);
|
||||
else if (Tok.Pair < 0) {
|
||||
ASSERT_FALSE(Stack.empty()) << Tok;
|
||||
ASSERT_EQ(Stack.back(), Tok.pair())
|
||||
<< *Stack.back() << " != " << *Tok.pair() << " = pair of " << Tok;
|
||||
Stack.pop_back();
|
||||
}
|
||||
}
|
||||
ASSERT_THAT(Stack, testing::IsEmpty());
|
||||
}
|
||||
|
||||
// Checks that ( pairs with a ) on its right, etc.
|
||||
void verifyMatchKind(const TokenStream &Stream) {
|
||||
for (const auto &Tok : Stream.tokens()) {
|
||||
if (Tok.Pair == 0)
|
||||
continue;
|
||||
auto Want = [&]() -> std::pair<bool, tok::TokenKind> {
|
||||
switch (Tok.Kind) {
|
||||
case tok::l_paren:
|
||||
return {true, tok::r_paren};
|
||||
case tok::r_paren:
|
||||
return {false, tok::l_paren};
|
||||
case tok::l_brace:
|
||||
return {true, tok::r_brace};
|
||||
case tok::r_brace:
|
||||
return {false, tok::l_brace};
|
||||
case tok::l_square:
|
||||
return {true, tok::r_square};
|
||||
case tok::r_square:
|
||||
return {false, tok::l_square};
|
||||
default:
|
||||
ADD_FAILURE() << "Paired non-bracket " << Tok;
|
||||
return {false, tok::eof};
|
||||
}
|
||||
}();
|
||||
EXPECT_EQ(Tok.Pair > 0, Want.first) << Tok;
|
||||
EXPECT_EQ(Tok.pair()->Kind, Want.second) << Tok;
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies an expected bracket pairing like:
|
||||
// ^( [ ^)
|
||||
// The input is annotated code, with the brackets expected to be matched marked.
|
||||
//
|
||||
// The input doesn't specify which bracket matches with which, but we verify:
|
||||
// - exactly the marked subset are paired
|
||||
// - ( is paired to a later ), etc
|
||||
// - brackets properly nest
|
||||
// This uniquely determines the bracket structure, so we indirectly verify it.
|
||||
// If particular tests should emphasize which brackets are paired, use comments.
|
||||
void verifyBrackets(llvm::StringRef MarkedCode) {
|
||||
SCOPED_TRACE(MarkedCode);
|
||||
llvm::Annotations A(MarkedCode);
|
||||
std::string Code = A.code().str();
|
||||
LangOptions LangOpts;
|
||||
auto Stream = lex(Code, LangOpts);
|
||||
pairBrackets(Stream);
|
||||
|
||||
verifyMatchedSet(Code, MarkedCode, Stream);
|
||||
verifyNesting(Stream);
|
||||
verifyMatchKind(Stream);
|
||||
}
|
||||
|
||||
TEST(Bracket, SimplePair) {
  const char *const Cases[] = {
      // Perfectly nested input: every bracket is paired.
      "^{ ^[ ^( ^) ^( ^) ^] ^}",
      // A leading stray closer and a trailing stray opener stay unpaired.
      ") ^{ ^[ ^] ^} (",
      // FIXME: currently nothing is paired here, because the mismatched ]
      // discards every pending opener.
      "{ [ ( ] }",
  };
  for (const char *Case : Cases)
    verifyBrackets(Case);
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
|
@ -1,9 +1,11 @@
|
|||
set(LLVM_LINK_COMPONENTS
|
||||
Support
|
||||
TestingSupport
|
||||
)
|
||||
|
||||
add_custom_target(ClangPseudoUnitTests)
|
||||
add_unittest(ClangPseudoUnitTests ClangPseudoTests
|
||||
BracketTest.cpp
|
||||
DirectiveTreeTest.cpp
|
||||
ForestTest.cpp
|
||||
GLRTest.cpp
|
||||
|
|
Loading…
Reference in New Issue