[Syntax] Build SimpleDeclaration node that groups multiple declarators

Summary:
Also remove the temporary TopLevelDeclaration node and add
UnknownDeclaration to represent other unknown nodes.

See the follow-up change for building more top-level declarations.
Adding declarators is also pretty involved and will be done in another
follow-up patch.

Reviewers: gribozavr2

Reviewed By: gribozavr2

Subscribers: merge_guards_bot, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D70787
This commit is contained in:
Ilya Biryukov 2019-11-29 11:32:58 +01:00
parent a48b5e2474
commit e702bdb859
4 changed files with 269 additions and 77 deletions

View File

@ -37,7 +37,6 @@ namespace syntax {
enum class NodeKind : uint16_t {
Leaf,
TranslationUnit,
TopLevelDeclaration,
// Expressions
UnknownExpression,
@ -57,7 +56,11 @@ enum class NodeKind : uint16_t {
ReturnStatement,
RangeBasedForStatement,
ExpressionStatement,
CompoundStatement
CompoundStatement,
// Declarations
UnknownDeclaration,
SimpleDeclaration,
};
/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, NodeKind K);
@ -102,20 +105,6 @@ public:
}
};
/// FIXME: this node is temporary and will be replaced with nodes for various
/// 'declarations' and 'declarators' from the C/C++ grammar
///
/// Represents any top-level declaration. Only there to give the syntax tree a
/// bit of structure until we implement syntax nodes for declarations and
/// declarators.
class TopLevelDeclaration final : public Tree {
public:
TopLevelDeclaration() : Tree(NodeKind::TopLevelDeclaration) {}
static bool classof(const Node *N) {
return N->kind() == NodeKind::TopLevelDeclaration;
}
};
/// A base class for all expressions. Note that expressions are not statements,
/// even though they are in clang.
class Expression : public Tree {
@ -313,6 +302,38 @@ public:
syntax::Leaf *rbrace();
};
/// A declaration that can appear at the top-level. Note that this does *not*
/// correspond 1-to-1 to clang::Decl. Syntax trees distinguish between top-level
/// declarations (e.g. namespace definitions) and declarators (e.g. variables,
/// typedefs, etc.). Declarators are stored inside SimpleDeclaration.
class Declaration : public Tree {
public:
Declaration(NodeKind K) : Tree(K) {}
static bool classof(const Node *N) {
return NodeKind::UnknownDeclaration <= N->kind() &&
N->kind() <= NodeKind::SimpleDeclaration;
}
};
/// Declaration of an unknown kind, e.g. not yet supported in syntax trees.
class UnknownDeclaration final : public Declaration {
public:
UnknownDeclaration() : Declaration(NodeKind::UnknownDeclaration) {}
static bool classof(const Node *N) {
return N->kind() == NodeKind::UnknownDeclaration;
}
};
/// Groups multiple declarators (e.g. variables, typedefs, etc.) together. All
/// grouped declarators share the same declaration specifiers (e.g. 'int' or
/// 'typedef').
class SimpleDeclaration final : public Declaration {
public:
SimpleDeclaration() : Declaration(NodeKind::SimpleDeclaration) {}
static bool classof(const Node *N) {
return N->kind() == NodeKind::SimpleDeclaration;
}
};
} // namespace syntax
} // namespace clang
#endif

View File

@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/BuildTree.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclBase.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/Stmt.h"
#include "clang/Basic/LLVM.h"
@ -56,6 +58,14 @@ public:
/// Range.
void foldNode(llvm::ArrayRef<syntax::Token> Range, syntax::Tree *New);
/// Must be called with the range of each `DeclaratorDecl`. Ensures the
/// corresponding declarator nodes are covered by `SimpleDeclaration`.
void noticeDeclaratorRange(llvm::ArrayRef<syntax::Token> Range);
/// Notifies that we should not consume trailing semicolon when computing
/// token range of \p D.
void noticeDeclaratorWithoutSemicolon(Decl *D);
/// Mark the \p Child node with a corresponding \p Role. All marked children
/// should be consumed by foldNode.
/// (!) when called on expressions (clang::Expr is derived from clang::Stmt),
@ -94,7 +104,14 @@ public:
return llvm::makeArrayRef(findToken(First), std::next(findToken(Last)));
}
llvm::ArrayRef<syntax::Token> getRange(const Decl *D) const {
return getRange(D->getBeginLoc(), D->getEndLoc());
auto Tokens = getRange(D->getBeginLoc(), D->getEndLoc());
if (llvm::isa<NamespaceDecl>(D))
return Tokens;
if (DeclsWithoutSemicolons.count(D))
return Tokens;
// FIXME: do not consume trailing semicolon on function definitions.
// Most declarations own a semicolon in syntax trees, but not in clang AST.
return withTrailingSemicolon(Tokens);
}
llvm::ArrayRef<syntax::Token> getExprRange(const Expr *E) const {
return getRange(E->getBeginLoc(), E->getEndLoc());
@ -108,14 +125,22 @@ public:
// Some statements miss a trailing semicolon, e.g. 'return', 'continue' and
// all statements that end with those. Consume this semicolon here.
//
// (!) statements never consume 'eof', so looking at the next token is ok.
if (Tokens.back().kind() == tok::semi)
return Tokens;
return withTrailingSemicolon(Tokens);
}
private:
llvm::ArrayRef<syntax::Token>
withTrailingSemicolon(llvm::ArrayRef<syntax::Token> Tokens) const {
assert(!Tokens.empty());
assert(Tokens.back().kind() != tok::eof);
// (!) we never consume 'eof', so looking at the next token is ok.
if (Tokens.back().kind() != tok::semi && Tokens.end()->kind() == tok::semi)
return llvm::makeArrayRef(Tokens.begin(), Tokens.end() + 1);
return Tokens;
}
private:
/// Finds a token starting at \p L. The token must exist.
const syntax::Token *findToken(SourceLocation L) const;
@ -136,6 +161,8 @@ private:
{&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}});
}
~Forest() { assert(DelayedFolds.empty()); }
void assignRole(llvm::ArrayRef<syntax::Token> Range,
syntax::NodeRole Role) {
assert(!Range.empty());
@ -148,30 +175,46 @@ private:
It->second.Role = Role;
}
/// Add \p Node to the forest and fill its children nodes based on the \p
/// NodeRange.
void foldChildren(llvm::ArrayRef<syntax::Token> NodeTokens,
/// Add \p Node to the forest and attach child nodes based on \p Tokens.
void foldChildren(llvm::ArrayRef<syntax::Token> Tokens,
syntax::Tree *Node) {
assert(!NodeTokens.empty());
assert(Node->firstChild() == nullptr && "node already has children");
// Execute delayed folds inside `Tokens`.
auto BeginExecuted = DelayedFolds.lower_bound(Tokens.begin());
auto It = BeginExecuted;
for (; It != DelayedFolds.end() && It->second.End <= Tokens.end(); ++It)
foldChildrenEager(llvm::makeArrayRef(It->first, It->second.End),
It->second.Node);
DelayedFolds.erase(BeginExecuted, It);
auto *FirstToken = NodeTokens.begin();
auto BeginChildren = Trees.lower_bound(FirstToken);
assert(BeginChildren != Trees.end() &&
BeginChildren->first == FirstToken &&
"fold crosses boundaries of existing subtrees");
auto EndChildren = Trees.lower_bound(NodeTokens.end());
assert((EndChildren == Trees.end() ||
EndChildren->first == NodeTokens.end()) &&
"fold crosses boundaries of existing subtrees");
// Attach children to `Node`.
foldChildrenEager(Tokens, Node);
}
// (!) we need to go in reverse order, because we can only prepend.
for (auto It = EndChildren; It != BeginChildren; --It)
Node->prependChildLowLevel(std::prev(It)->second.Node,
std::prev(It)->second.Role);
/// Schedule a call to `foldChildren` that will only be executed when
/// containing node is folded. The range of delayed nodes can be extended by
/// calling `extendDelayedFold`. Only one delayed node for each starting
/// token is allowed.
void foldChildrenDelayed(llvm::ArrayRef<syntax::Token> Tokens,
syntax::Tree *Node) {
assert(!Tokens.empty());
bool Inserted =
DelayedFolds.insert({Tokens.begin(), DelayedFold{Tokens.end(), Node}})
.second;
(void)Inserted;
assert(Inserted && "Multiple delayed folds start at the same token");
}
Trees.erase(BeginChildren, EndChildren);
Trees.insert({FirstToken, NodeAndRole(Node)});
/// If there a delayed fold, starting at `ExtendedRange.begin()`, extends
/// its endpoint to `ExtendedRange.end()` and returns true.
/// Otherwise, returns false.
bool extendDelayedFold(llvm::ArrayRef<syntax::Token> ExtendedRange) {
assert(!ExtendedRange.empty());
auto It = DelayedFolds.find(ExtendedRange.data());
if (It == DelayedFolds.end())
return false;
assert(It->second.End <= ExtendedRange.end());
It->second.End = ExtendedRange.end();
return true;
}
// EXPECTS: all tokens were consumed and are owned by a single root node.
@ -199,6 +242,30 @@ private:
}
private:
/// Implementation detail of `foldChildren`, does acutal folding ignoring
/// delayed folds.
void foldChildrenEager(llvm::ArrayRef<syntax::Token> Tokens,
syntax::Tree *Node) {
assert(Node->firstChild() == nullptr && "node already has children");
auto *FirstToken = Tokens.begin();
auto BeginChildren = Trees.lower_bound(FirstToken);
assert((BeginChildren == Trees.end() ||
BeginChildren->first == FirstToken) &&
"fold crosses boundaries of existing subtrees");
auto EndChildren = Trees.lower_bound(Tokens.end());
assert(
(EndChildren == Trees.end() || EndChildren->first == Tokens.end()) &&
"fold crosses boundaries of existing subtrees");
// (!) we need to go in reverse order, because we can only prepend.
for (auto It = EndChildren; It != BeginChildren; --It)
Node->prependChildLowLevel(std::prev(It)->second.Node,
std::prev(It)->second.Role);
Trees.erase(BeginChildren, EndChildren);
Trees.insert({FirstToken, NodeAndRole(Node)});
}
/// A with a role that should be assigned to it when adding to a parent.
struct NodeAndRole {
explicit NodeAndRole(syntax::Node *Node)
@ -214,6 +281,13 @@ private:
/// FIXME: storing the end tokens is redundant.
/// FIXME: the key of a map is redundant, it is also stored in NodeForRange.
std::map<const syntax::Token *, NodeAndRole> Trees;
/// See documentation of `foldChildrenDelayed` for details.
struct DelayedFold {
const syntax::Token *End = nullptr;
syntax::Tree *Node = nullptr;
};
std::map<const syntax::Token *, DelayedFold> DelayedFolds;
};
/// For debugging purposes.
@ -221,6 +295,7 @@ private:
syntax::Arena &Arena;
Forest Pending;
llvm::DenseSet<Decl*> DeclsWithoutSemicolons;
};
namespace {
@ -231,20 +306,30 @@ public:
bool shouldTraversePostOrder() const { return true; }
bool TraverseDecl(Decl *D) {
if (!D || isa<TranslationUnitDecl>(D))
return RecursiveASTVisitor::TraverseDecl(D);
if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext()))
return true; // Only build top-level decls for now, do not recurse.
return RecursiveASTVisitor::TraverseDecl(D);
bool WalkUpFromDeclaratorDecl(DeclaratorDecl *D) {
// Ensure declarators are covered by SimpleDeclaration.
Builder.noticeDeclaratorRange(Builder.getRange(D));
// FIXME: build nodes for the declarator too.
return true;
}
bool WalkUpFromTypedefNameDecl(TypedefNameDecl *D) {
// Also a declarator.
Builder.noticeDeclaratorRange(Builder.getRange(D));
// FIXME: build nodes for the declarator too.
return true;
}
bool VisitDecl(Decl *D) {
assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) &&
"expected a top-level decl");
assert(!D->isImplicit());
Builder.foldNode(Builder.getRange(D),
new (allocator()) syntax::TopLevelDeclaration());
new (allocator()) syntax::UnknownDeclaration());
return true;
}
bool WalkUpFromTagDecl(TagDecl *C) {
// Avoid building UnknownDeclaration here, syntatically 'struct X {}' and
// similar are part of declaration specifiers and do not introduce a new
// top-level declaration.
return true;
}
@ -291,7 +376,11 @@ public:
}
bool TraverseStmt(Stmt *S) {
if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) {
if (auto *DS = llvm::dyn_cast_or_null<DeclStmt>(S)) {
// We want to consume the semicolon, make sure SimpleDeclaration does not.
for (auto *D : DS->decls())
Builder.noticeDeclaratorWithoutSemicolon(D);
} else if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) {
// (!) do not recurse into subexpressions.
// we do not have syntax trees for expressions yet, so we only want to see
// the first top-level expression.
@ -429,6 +518,18 @@ void syntax::TreeBuilder::foldNode(llvm::ArrayRef<syntax::Token> Range,
Pending.foldChildren(Range, New);
}
void syntax::TreeBuilder::noticeDeclaratorRange(
llvm::ArrayRef<syntax::Token> Range) {
if (Pending.extendDelayedFold(Range))
return;
Pending.foldChildrenDelayed(Range,
new (allocator()) syntax::SimpleDeclaration);
}
void syntax::TreeBuilder::noticeDeclaratorWithoutSemicolon(Decl *D) {
DeclsWithoutSemicolons.insert(D);
}
void syntax::TreeBuilder::markChildToken(SourceLocation Loc, NodeRole Role) {
if (Loc.isInvalid())
return;

View File

@ -16,8 +16,6 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
return OS << "Leaf";
case NodeKind::TranslationUnit:
return OS << "TranslationUnit";
case NodeKind::TopLevelDeclaration:
return OS << "TopLevelDeclaration";
case NodeKind::UnknownExpression:
return OS << "UnknownExpression";
case NodeKind::UnknownStatement:
@ -50,6 +48,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
return OS << "ExpressionStatement";
case NodeKind::CompoundStatement:
return OS << "CompoundStatement";
case NodeKind::UnknownDeclaration:
return OS << "UnknownDeclaration";
case NodeKind::SimpleDeclaration:
return OS << "SimpleDeclaration";
}
llvm_unreachable("unknown node kind");
}

View File

@ -130,7 +130,7 @@ void foo() {}
)cpp",
R"txt(
*: TranslationUnit
|-TopLevelDeclaration
|-SimpleDeclaration
| |-int
| |-main
| |-(
@ -138,7 +138,7 @@ void foo() {}
| `-CompoundStatement
| |-{
| `-}
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-foo
|-(
@ -157,7 +157,7 @@ int main() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-int
|-main
|-(
@ -202,7 +202,7 @@ void test() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -224,7 +224,7 @@ void test() {
{"void test() { int a = 10; }",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -232,16 +232,18 @@ void test() {
`-CompoundStatement
|-{
|-DeclarationStatement
| |-int
| |-a
| |-=
| |-10
| |-SimpleDeclaration
| | |-int
| | |-a
| | |-=
| | `-UnknownExpression
| | `-10
| `-;
`-}
)txt"},
{"void test() { ; }", R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -263,7 +265,7 @@ void test() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -299,7 +301,7 @@ void test() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -329,7 +331,7 @@ int test() { return 1; }
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-int
|-test
|-(
@ -352,7 +354,7 @@ void test() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -360,18 +362,21 @@ void test() {
`-CompoundStatement
|-{
|-DeclarationStatement
| |-int
| |-a
| |-[
| |-3
| |-]
| |-SimpleDeclaration
| | |-int
| | |-a
| | |-[
| | |-UnknownExpression
| | | `-3
| | `-]
| `-;
|-RangeBasedForStatement
| |-for
| |-(
| |-int
| |-x
| |-:
| |-SimpleDeclaration
| | |-int
| | |-x
| | `-:
| |-UnknownExpression
| | `-a
| |-)
@ -384,7 +389,7 @@ void test() {
// counterpart.
{"void main() { foo: return 100; }", R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-main
|-(
@ -411,7 +416,7 @@ void test() {
)cpp",
R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
|-void
|-test
|-(
@ -444,7 +449,70 @@ void test() {
| | `-)
| `-;
`-}
)txt"}};
)txt"},
// Multiple declarators group into a single SimpleDeclaration.
{R"cpp(
int *a, b;
)cpp",
R"txt(
*: TranslationUnit
`-SimpleDeclaration
|-int
|-*
|-a
|-,
|-b
`-;
)txt"},
{R"cpp(
typedef int *a, b;
)cpp",
R"txt(
*: TranslationUnit
`-SimpleDeclaration
|-typedef
|-int
|-*
|-a
|-,
|-b
`-;
)txt"},
// Multiple declarators inside a statement.
{R"cpp(
void foo() {
int *a, b;
typedef int *ta, tb;
}
)cpp",
R"txt(
*: TranslationUnit
`-SimpleDeclaration
|-void
|-foo
|-(
|-)
`-CompoundStatement
|-{
|-DeclarationStatement
| |-SimpleDeclaration
| | |-int
| | |-*
| | |-a
| | |-,
| | `-b
| `-;
|-DeclarationStatement
| |-SimpleDeclaration
| | |-typedef
| | |-int
| | |-*
| | |-ta
| | |-,
| | `-tb
| `-;
`-}
)txt"}};
for (const auto &T : Cases) {
auto *Root = buildTree(T.first);