forked from OSchip/llvm-project
[pseudo] Implement the GLR parsing algorithm.
This patch implements a standard GLR parsing algorithm, the core piece of the pseudoparser. - it parses preprocessed C++ code, currently it supports correct code only and parse them as a translation-unit; - it produces a forest which stores all possible trees in an efficient manner (only a single node being build for per (SymbolID, Token Range)); no disambiguation yet; Differential Revision: https://reviews.llvm.org/D121150
This commit is contained in:
parent
ca0b416659
commit
eac22d0754
|
@ -157,6 +157,10 @@ public:
|
|||
return create(ForestNode::Opaque, SID, Start, 0, {});
|
||||
}
|
||||
|
||||
ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
|
||||
return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
|
||||
}
|
||||
|
||||
size_t nodeCount() const { return NodeCount; }
|
||||
size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
|
||||
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This implements a standard Generalized LR (GLR) parsing algorithm.
|
||||
//
|
||||
// The GLR parser behaves as a normal LR parser until it encounters a conflict.
|
||||
// To handle a conflict (where there are multiple actions could perform), the
|
||||
// parser will simulate nondeterminism by doing a breadth-first search
|
||||
// over all the possibilities.
|
||||
//
|
||||
// Basic mechanisims of the GLR parser:
|
||||
// - A number of processes are operated in parallel.
|
||||
// - Each process has its own parsing stack and behaves as a standard
|
||||
// determinism LR parser.
|
||||
// - When a process encounters a conflict, it will be fork (one for each
|
||||
// avaiable action).
|
||||
// - When a process encounters an error, it is abandoned.
|
||||
// - All process are synchronized by the lookahead token: they perfrom shift
|
||||
// action at the same time, which means some processes need wait until other
|
||||
// processes have performed all reduce actions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef CLANG_PSEUDO_GLR_H
|
||||
#define CLANG_PSEUDO_GLR_H
|
||||
|
||||
#include "clang-pseudo/Forest.h"
|
||||
#include "clang-pseudo/Grammar.h"
|
||||
#include "clang-pseudo/LRTable.h"
|
||||
#include "llvm/Support/Allocator.h"
|
||||
#include <vector>
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
|
||||
// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
|
||||
// parser.
|
||||
//
|
||||
// Each node stores a parse state, the last parsed ForestNode, and the parent
|
||||
// node. There may be several heads (top of stack), and the parser operates by:
|
||||
// - shift: pushing terminal symbols on top of the stack
|
||||
// - reduce: replace N symbols on top of the stack with one nonterminal
|
||||
//
|
||||
// The structure is a DAG rather than a linear stack:
|
||||
// - GLR allows multiple actions (conflicts) on the same head, producing forks
|
||||
// where several nodes have the same parent
|
||||
// - The parser merges nodes with the same (state, ForestNode), producing joins
|
||||
// where one node has multiple parents
|
||||
//
|
||||
// The parser is responsible for creating nodes and keeping track of the set of
|
||||
// heads. The GSS class is mostly an arena for them.
|
||||
struct GSS {
|
||||
// A node represents a partial parse of the input up to some point.
|
||||
//
|
||||
// It is the equivalent of a frame in an LR parse stack.
|
||||
// Like such a frame, it has an LR parse state and a syntax-tree node
|
||||
// representing the last parsed symbol (a ForestNode in our case).
|
||||
// Unlike a regular LR stack frame, it may have multiple parents.
|
||||
//
|
||||
// Nodes are not exactly pushed and popped on the stack: pushing is just
|
||||
// allocating a new head node with a parent pointer to the old head. Popping
|
||||
// is just forgetting about a node and remembering its parent instead.
|
||||
struct alignas(struct Node *) Node {
|
||||
// LR state describing how parsing should continue from this head.
|
||||
LRTable::StateID State;
|
||||
// Number of the parents of this node.
|
||||
// The parents hold previous parsed symbols, and may resume control after
|
||||
// this node is reduced.
|
||||
unsigned ParentCount;
|
||||
// The parse node for the last parsed symbol.
|
||||
// This symbol appears on the left of the dot in the parse state's items.
|
||||
// (In the literature, the node is attached to the *edge* to the parent).
|
||||
const ForestNode *Payload = nullptr;
|
||||
|
||||
// FIXME: Most nodes live a fairly short time, and are simply discarded.
|
||||
// Is it worth refcounting them (we have empty padding) and returning to a
|
||||
// freelist, to keep the working set small?
|
||||
|
||||
llvm::ArrayRef<const Node *> parents() const {
|
||||
return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
|
||||
ParentCount);
|
||||
};
|
||||
// Parents are stored as a trailing array of Node*.
|
||||
};
|
||||
|
||||
// Allocates a new node in the graph.
|
||||
const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
|
||||
llvm::ArrayRef<const Node *> Parents) {
|
||||
++NodeCount;
|
||||
Node *Result = new (Arena.Allocate(
|
||||
sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node)))
|
||||
Node({State, static_cast<unsigned>(Parents.size())});
|
||||
Result->Payload = Symbol;
|
||||
if (!Parents.empty())
|
||||
llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
|
||||
return Result;
|
||||
}
|
||||
|
||||
size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
|
||||
size_t nodeCount() const { return NodeCount; }
|
||||
|
||||
private:
|
||||
llvm::BumpPtrAllocator Arena;
|
||||
unsigned NodeCount = 0;
|
||||
};
|
||||
|
||||
// Parameters for the GLR parsing.
|
||||
struct ParseParams {
|
||||
// The grammar of the language we're going to parse.
|
||||
const Grammar &G;
|
||||
// The LR table which GLR uses to parse the input, should correspond to the
|
||||
// Grammar G.
|
||||
const LRTable &Table;
|
||||
|
||||
// Arena for data structure used by the GLR algorithm.
|
||||
ForestArena &Forest; // Storage for the output forest.
|
||||
GSS &GSS; // Storage for parsing stacks.
|
||||
};
|
||||
// Parse the given token stream with the GLR algorithm, and return a forest node
|
||||
// of the start symbol.
|
||||
//
|
||||
// If the parsing fails, we model it as an opaque node in the forest.
|
||||
//
|
||||
// FIXME: add support for variant start symbols.
|
||||
const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params);
|
||||
|
||||
// An active stack head can have multiple available actions (reduce/reduce
|
||||
// actions, reduce/shift actions).
|
||||
// A step is any one action applied to any one stack head.
|
||||
struct ParseStep {
|
||||
// A specific stack head.
|
||||
const GSS::Node *Head = nullptr;
|
||||
// An action associated with the head.
|
||||
LRTable::Action Action = LRTable::Action::sentinel();
|
||||
};
|
||||
// A callback is invoked whenever a new GSS head is created during the GLR
|
||||
// parsing process (glrShift, or glrReduce).
|
||||
using NewHeadCallback = std::function<void(const GSS::Node *)>;
|
||||
// Apply all PendingShift actions on a given GSS state, newly-created heads are
|
||||
// passed to the callback.
|
||||
//
|
||||
// When this function returns, PendingShift is empty.
|
||||
//
|
||||
// Exposed for testing only.
|
||||
void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NextTok,
|
||||
const ParseParams &Params, NewHeadCallback NewHeadCB);
|
||||
// Applies PendingReduce actions, until no more reduce actions are available.
|
||||
//
|
||||
// When this function returns, PendingReduce is empty. Calls to NewHeadCB may
|
||||
// add elements to PendingReduce
|
||||
//
|
||||
// Exposed for testing only.
|
||||
void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
|
||||
NewHeadCallback NewHeadCB);
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
||||
|
||||
#endif // CLANG_PSEUDO_GLR_H
|
|
@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS Support)
|
|||
add_clang_library(clangPseudo
|
||||
DirectiveTree.cpp
|
||||
Forest.cpp
|
||||
GLR.cpp
|
||||
Grammar.cpp
|
||||
GrammarBNF.cpp
|
||||
Lex.cpp
|
||||
|
|
|
@ -0,0 +1,369 @@
|
|||
//===--- GLR.cpp -----------------------------------------------*- C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/GLR.h"
|
||||
#include "clang-pseudo/Grammar.h"
|
||||
#include "clang-pseudo/LRTable.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
|
||||
#define DEBUG_TYPE "GLR.cpp"
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
|
||||
using StateID = LRTable::StateID;
|
||||
|
||||
const ForestNode &glrParse(const TokenStream &Tokens,
|
||||
const ParseParams &Params) {
|
||||
llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Tokens);
|
||||
auto &G = Params.G;
|
||||
auto &GSS = Params.GSS;
|
||||
|
||||
// Lists of active shift, reduce, accept actions.
|
||||
std::vector<ParseStep> PendingShift, PendingReduce, PendingAccept;
|
||||
auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) {
|
||||
for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) {
|
||||
switch (Action.kind()) {
|
||||
case LRTable::Action::Shift:
|
||||
PendingShift.push_back({Head, Action});
|
||||
break;
|
||||
case LRTable::Action::Reduce:
|
||||
PendingReduce.push_back({Head, Action});
|
||||
break;
|
||||
case LRTable::Action::Accept:
|
||||
PendingAccept.push_back({Head, Action});
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("unexpected action kind!");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<const GSS::Node *> NewHeads = {
|
||||
GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})};
|
||||
for (const ForestNode &Terminal : Terminals) {
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
|
||||
G.symbolName(Terminal.symbol()),
|
||||
Terminal.symbol()));
|
||||
for (const auto *Head : NewHeads)
|
||||
AddSteps(Head, Terminal.symbol());
|
||||
NewHeads.clear();
|
||||
glrReduce(PendingReduce, Params,
|
||||
[&](const GSS::Node * NewHead) {
|
||||
// A reduce will enable more steps.
|
||||
AddSteps(NewHead, Terminal.symbol());
|
||||
});
|
||||
|
||||
glrShift(PendingShift, Terminal, Params,
|
||||
[&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); });
|
||||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n"));
|
||||
for (const auto *Heads : NewHeads)
|
||||
AddSteps(Heads, tokenSymbol(tok::eof));
|
||||
glrReduce(PendingReduce, Params,
|
||||
[&](const GSS::Node * NewHead) {
|
||||
// A reduce will enable more steps.
|
||||
AddSteps(NewHead, tokenSymbol(tok::eof));
|
||||
});
|
||||
|
||||
if (!PendingAccept.empty()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted result:\n",
|
||||
PendingAccept.size()));
|
||||
for (const auto &Accept : PendingAccept)
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< " - " << G.symbolName(Accept.Head->Payload->symbol())
|
||||
<< "\n");
|
||||
assert(PendingAccept.size() == 1);
|
||||
return *PendingAccept.front().Head->Payload;
|
||||
}
|
||||
// We failed to parse the input, returning an opaque forest node for recovery.
|
||||
auto RulesForStart = G.rulesFor(G.startSymbol());
|
||||
// FIXME: support multiple start symbols.
|
||||
assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 &&
|
||||
"start symbol _ must have exactly one rule");
|
||||
return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0);
|
||||
}
|
||||
|
||||
// Apply all pending shift actions.
|
||||
// In theory, LR parsing doesn't have shift/shift conflicts on a single head.
|
||||
// But we may have multiple active heads, and each head has a shift action.
|
||||
//
|
||||
// We merge the stack -- if multiple heads will reach the same state after
|
||||
// shifting a token, we shift only once by combining these heads.
|
||||
//
|
||||
// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
|
||||
// 0---1---2
|
||||
// └---3
|
||||
// After the shift action, the GSS is:
|
||||
// 0---1---2---4
|
||||
// └---3---┘
|
||||
void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NewTok,
|
||||
const ParseParams &Params, NewHeadCallback NewHeadCB) {
|
||||
assert(NewTok.kind() == ForestNode::Terminal);
|
||||
assert(llvm::all_of(PendingShift,
|
||||
[](const ParseStep &Step) {
|
||||
return Step.Action.kind() == LRTable::Action::Shift;
|
||||
}) &&
|
||||
"Pending shift actions must be shift actions");
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n",
|
||||
Params.G.symbolName(NewTok.symbol()),
|
||||
PendingShift.size()));
|
||||
|
||||
// We group pending shifts by their target state so we can merge them.
|
||||
llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) {
|
||||
return L.Action.getShiftState() < R.Action.getShiftState();
|
||||
});
|
||||
auto Rest = llvm::makeArrayRef(PendingShift);
|
||||
llvm::SmallVector<const GSS::Node *> Parents;
|
||||
while (!Rest.empty()) {
|
||||
// Collect the batch of PendingShift that have compatible shift states.
|
||||
// Their heads become TempParents, the parents of the new GSS node.
|
||||
StateID NextState = Rest.front().Action.getShiftState();
|
||||
|
||||
Parents.clear();
|
||||
for (const auto &Base : Rest) {
|
||||
if (Base.Action.getShiftState() != NextState)
|
||||
break;
|
||||
Parents.push_back(Base.Head);
|
||||
}
|
||||
Rest = Rest.drop_front(Parents.size());
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n",
|
||||
NextState, Parents.size()));
|
||||
NewHeadCB(Params.GSS.addNode(NextState, &NewTok, Parents));
|
||||
}
|
||||
PendingShift.clear();
|
||||
}
|
||||
|
||||
namespace {
|
||||
// A KeyedQueue yields pairs of keys and values in order of the keys.
|
||||
template <typename Key, typename Value>
|
||||
using KeyedQueue =
|
||||
std::priority_queue<std::pair<Key, Value>,
|
||||
std::vector<std::pair<Key, Value>>, llvm::less_first>;
|
||||
|
||||
template <typename T> void sortAndUnique(std::vector<T> &Vec) {
|
||||
llvm::sort(Vec);
|
||||
Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Perform reduces until no more are possible.
|
||||
//
|
||||
// Generally this means walking up from the heads gathering ForestNodes that
|
||||
// will match the RHS of the rule we're reducing into a sequence ForestNode,
|
||||
// and ending up at a base node.
|
||||
// Then we push a new GSS node onto that base, taking care to:
|
||||
// - pack alternative sequence ForestNodes into an ambiguous ForestNode.
|
||||
// - use the same GSS node for multiple heads if the parse state matches.
|
||||
//
|
||||
// Examples of reduction:
|
||||
// Before (simple):
|
||||
// 0--1(expr)--2(semi)
|
||||
// After reducing 2 by `stmt := expr semi`:
|
||||
// 0--3(stmt) // 3 is goto(0, stmt)
|
||||
//
|
||||
// Before (splitting due to R/R conflict):
|
||||
// 0--1(IDENTIFIER)
|
||||
// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
|
||||
// 0--2(class-name) // 2 is goto(0, class-name)
|
||||
// └--3(enum-name) // 3 is goto(0, enum-name)
|
||||
//
|
||||
// Before (splitting due to multiple bases):
|
||||
// 0--2(class-name)--4(STAR)
|
||||
// └--3(enum-name)---┘
|
||||
// After reducing 4 by `ptr-operator := STAR`:
|
||||
// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
|
||||
// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
|
||||
//
|
||||
// Before (joining due to same goto state, multiple bases):
|
||||
// 0--1(cv-qualifier)--3(class-name)
|
||||
// └--2(cv-qualifier)--4(enum-name)
|
||||
// After reducing 3 by `type-name := class-name` and
|
||||
// 4 by `type-name := enum-name`:
|
||||
// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
|
||||
// └--2(cv-qualifier)--┘ // goto(2, type-name)
|
||||
//
|
||||
// Before (joining due to same goto state, the same base):
|
||||
// 0--1(class-name)--3(STAR)
|
||||
// └--2(enum-name)--4(STAR)
|
||||
// After reducing 3 by `pointer := class-name STAR` and
|
||||
// 2 by`enum-name := class-name STAR`:
|
||||
// 0--5(pointer) // 5 is goto(0, pointer)
|
||||
void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
|
||||
NewHeadCallback NewHeadCB) {
|
||||
// There are two interacting complications:
|
||||
// 1. Performing one reduce can unlock new reduces on the newly-created head.
|
||||
// 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
|
||||
// This means we must have unlocked all the reduces that contribute to it.
|
||||
// 2b. Similarly, the new GSS nodes must be complete (have all parents).
|
||||
//
|
||||
// We define a "family" of reduces as those that produce the same symbol and
|
||||
// cover the same range of tokens. These are exactly the set of reductions
|
||||
// whose sequence nodes would be covered by the same ambiguous node.
|
||||
// We wish to process a whole family at a time (to satisfy complication 2),
|
||||
// and can address complication 1 by carefully ordering the families:
|
||||
// - Process families covering fewer tokens first.
|
||||
// A reduce can't depend on a longer reduce!
|
||||
// - For equal token ranges: if S := T, process T families before S families.
|
||||
// Parsing T can't depend on an equal-length S, as the grammar is acyclic.
|
||||
//
|
||||
// This isn't quite enough: we don't know the token length of the reduction
|
||||
// until we walk up the stack to perform the pop.
|
||||
// So we perform the pop part upfront, and place the push specification on
|
||||
// priority queues such that we can retrieve a family at a time.
|
||||
|
||||
// A reduction family is characterized by its token range and symbol produced.
|
||||
// It is used as a key in the priority queues to group pushes by family.
|
||||
struct Family {
|
||||
// The start of the token range of the reduce.
|
||||
Token::Index Start;
|
||||
SymbolID Symbol;
|
||||
// Rule must produce Symbol and can otherwise be arbitrary.
|
||||
// RuleIDs have the topological order based on the acyclic grammar.
|
||||
// FIXME: should SymbolIDs be so ordered instead?
|
||||
RuleID Rule;
|
||||
|
||||
bool operator==(const Family &Other) const {
|
||||
return Start == Other.Start && Symbol == Other.Symbol;
|
||||
}
|
||||
// The larger Family is the one that should be processed first.
|
||||
bool operator<(const Family &Other) const {
|
||||
if (Start != Other.Start)
|
||||
return Start < Other.Start;
|
||||
if (Symbol != Other.Symbol)
|
||||
return Rule > Other.Rule;
|
||||
assert(*this == Other);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// The base nodes are the heads after popping the GSS nodes we are reducing.
|
||||
// We don't care which rule yielded each base. If Family.Symbol is S, the
|
||||
// base includes an item X := ... • S ... and since the grammar is
|
||||
// context-free, *all* parses of S are valid here.
|
||||
// FIXME: reuse the queues across calls instead of reallocating.
|
||||
KeyedQueue<Family, const GSS::Node *> Bases;
|
||||
|
||||
// A sequence is the ForestNode payloads of the GSS nodes we are reducing.
|
||||
// These are the RHS of the rule, the RuleID is stored in the Family.
|
||||
// They specify a sequence ForestNode we may build (but we dedup first).
|
||||
using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
|
||||
KeyedQueue<Family, Sequence> Sequences;
|
||||
|
||||
Sequence TempSequence;
|
||||
// Pop walks up the parent chain(s) for a reduction from Head by to Rule.
|
||||
// Once we reach the end, record the bases and sequences.
|
||||
auto Pop = [&](const GSS::Node *Head, RuleID RID) {
|
||||
LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.G.dumpRule(RID) << "\n");
|
||||
const auto &Rule = Params.G.lookupRule(RID);
|
||||
Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
|
||||
TempSequence.resize_for_overwrite(Rule.Size);
|
||||
auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
|
||||
if (I == Rule.Size) {
|
||||
F.Start = TempSequence.front()->startTokenIndex();
|
||||
Bases.emplace(F, N);
|
||||
LLVM_DEBUG(llvm::dbgs() << " --> base at S" << N->State << "\n");
|
||||
Sequences.emplace(F, TempSequence);
|
||||
return;
|
||||
}
|
||||
TempSequence[Rule.Size - 1 - I] = N->Payload;
|
||||
for (const GSS::Node *Parent : N->parents())
|
||||
DFS(Parent, I + 1, DFS);
|
||||
};
|
||||
DFS(Head, 0, DFS);
|
||||
};
|
||||
auto PopPending = [&] {
|
||||
for (const ParseStep &Pending : PendingReduce)
|
||||
Pop(Pending.Head, Pending.Action.getReduceRule());
|
||||
PendingReduce.clear();
|
||||
};
|
||||
|
||||
std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
|
||||
std::vector<std::pair<RuleID, Sequence>> FamilySequences;
|
||||
|
||||
std::vector<const GSS::Node *> TempGSSNodes;
|
||||
std::vector<const ForestNode *> TempForestNodes;
|
||||
|
||||
// Main reduction loop:
|
||||
// - pop as much as we can
|
||||
// - process one family at a time, forming a forest node
|
||||
// - produces new GSS heads which may enable more pops
|
||||
PopPending();
|
||||
while (!Bases.empty()) {
|
||||
// We should always have bases and sequences for the same families.
|
||||
Family F = Bases.top().first;
|
||||
assert(!Sequences.empty());
|
||||
assert(Sequences.top().first == F);
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << " Push " << Params.G.symbolName(F.Symbol)
|
||||
<< " from token " << F.Start << "\n");
|
||||
|
||||
// Grab the sequences for this family.
|
||||
FamilySequences.clear();
|
||||
do {
|
||||
FamilySequences.emplace_back(Sequences.top().first.Rule,
|
||||
Sequences.top().second);
|
||||
Sequences.pop();
|
||||
} while (!Sequences.empty() && Sequences.top().first == F);
|
||||
// Build a forest node for each unique sequence.
|
||||
sortAndUnique(FamilySequences);
|
||||
auto &SequenceNodes = TempForestNodes;
|
||||
SequenceNodes.clear();
|
||||
for (const auto &SequenceSpec : FamilySequences)
|
||||
SequenceNodes.push_back(&Params.Forest.createSequence(
|
||||
F.Symbol, SequenceSpec.first, SequenceSpec.second));
|
||||
// Wrap in an ambiguous node if needed.
|
||||
const ForestNode *Parsed =
|
||||
SequenceNodes.size() == 1
|
||||
? SequenceNodes.front()
|
||||
: &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
|
||||
LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Params.G) << "\n");
|
||||
|
||||
// Grab the bases for this family.
|
||||
// As well as deduplicating them, we'll group by the goto state.
|
||||
FamilyBases.clear();
|
||||
do {
|
||||
FamilyBases.emplace_back(
|
||||
Params.Table.getGoToState(Bases.top().second->State, F.Symbol),
|
||||
Bases.top().second);
|
||||
Bases.pop();
|
||||
} while (!Bases.empty() && Bases.top().first == F);
|
||||
sortAndUnique(FamilyBases);
|
||||
// Create a GSS node for each unique goto state.
|
||||
llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
|
||||
while (!BasesLeft.empty()) {
|
||||
StateID NextState = BasesLeft.front().first;
|
||||
auto &Parents = TempGSSNodes;
|
||||
Parents.clear();
|
||||
for (const auto &Base : BasesLeft) {
|
||||
if (Base.first != NextState)
|
||||
break;
|
||||
Parents.push_back(Base.second);
|
||||
}
|
||||
BasesLeft = BasesLeft.drop_front(Parents.size());
|
||||
|
||||
// Invoking the callback for new heads, a real GLR parser may add new
|
||||
// reduces to the PendingReduce queue!
|
||||
NewHeadCB(Params.GSS.addNode(NextState, Parsed, Parents));
|
||||
}
|
||||
PopPending();
|
||||
}
|
||||
assert(Sequences.empty());
|
||||
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
|
@ -0,0 +1,23 @@
|
|||
// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
|
||||
|
||||
void foo() {
|
||||
T* a; // a multiply expression or a pointer declaration?
|
||||
// CHECK: statement-seq~statement := <ambiguous>
|
||||
// CHECK-NEXT: ├─statement~expression-statement := expression ;
|
||||
// CHECK-NEXT: │ ├─expression~multiplicative-expression := multiplicative-expression * pm-expression
|
||||
// CHECK-NEXT: │ │ ├─multiplicative-expression~IDENTIFIER := tok[5]
|
||||
// CHECK-NEXT: │ │ ├─* := tok[6]
|
||||
// CHECK-NEXT: │ │ └─pm-expression~IDENTIFIER := tok[7]
|
||||
// CHECK-NEXT: │ └─; := tok[8]
|
||||
// CHECK-NEXT: └─statement~simple-declaration := decl-specifier-seq init-declarator-list ;
|
||||
// CHECK-NEXT: ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
|
||||
// CHECK-NEXT: │ ├─simple-type-specifier~type-name := <ambiguous>
|
||||
// CHECK-NEXT: │ │ ├─type-name~IDENTIFIER := tok[5]
|
||||
// CHECK-NEXT: │ │ ├─type-name~IDENTIFIER := tok[5]
|
||||
// CHECK-NEXT: │ │ └─type-name~IDENTIFIER := tok[5]
|
||||
// CHECK-NEXT: │ └─simple-type-specifier~IDENTIFIER := tok[5]
|
||||
// CHECK-NEXT: ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
|
||||
// CHECK-NEXT: │ ├─ptr-operator~* := tok[6]
|
||||
// CHECK-NEXT: │ └─ptr-declarator~IDENTIFIER := tok[7]
|
||||
// CHECK-NEXT: └─; := tok[8]
|
||||
}
|
|
@ -7,6 +7,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/DirectiveTree.h"
|
||||
#include "clang-pseudo/GLR.h"
|
||||
#include "clang-pseudo/Grammar.h"
|
||||
#include "clang-pseudo/LRGraph.h"
|
||||
#include "clang-pseudo/LRTable.h"
|
||||
|
@ -35,6 +36,8 @@ static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
|
|||
static opt<bool>
|
||||
PrintDirectiveTree("print-directive-tree",
|
||||
desc("Print directive structure of source code"));
|
||||
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
|
||||
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
|
||||
|
||||
static std::string readOrDie(llvm::StringRef Path) {
|
||||
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
|
||||
|
@ -50,6 +53,28 @@ static std::string readOrDie(llvm::StringRef Path) {
|
|||
int main(int argc, char *argv[]) {
|
||||
llvm::cl::ParseCommandLineOptions(argc, argv, "");
|
||||
|
||||
clang::LangOptions LangOpts; // FIXME: use real options.
|
||||
LangOpts.CPlusPlus = 1;
|
||||
std::string SourceText;
|
||||
llvm::Optional<clang::pseudo::TokenStream> RawStream;
|
||||
llvm::Optional<clang::pseudo::DirectiveTree> DirectiveStructure;
|
||||
llvm::Optional<clang::pseudo::TokenStream> ParseableStream;
|
||||
if (Source.getNumOccurrences()) {
|
||||
SourceText = readOrDie(Source);
|
||||
RawStream = clang::pseudo::lex(SourceText, LangOpts);
|
||||
DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream);
|
||||
clang::pseudo::chooseConditionalBranches(*DirectiveStructure, *RawStream);
|
||||
|
||||
if (PrintDirectiveTree)
|
||||
llvm::outs() << DirectiveStructure;
|
||||
if (PrintSource)
|
||||
RawStream->print(llvm::outs());
|
||||
if (PrintTokens)
|
||||
llvm::outs() << RawStream;
|
||||
|
||||
ParseableStream = clang::pseudo::stripComments(cook(*RawStream, LangOpts));
|
||||
}
|
||||
|
||||
if (Grammar.getNumOccurrences()) {
|
||||
std::string Text = readOrDie(Grammar);
|
||||
std::vector<std::string> Diags;
|
||||
|
@ -65,24 +90,26 @@ int main(int argc, char *argv[]) {
|
|||
llvm::outs() << G->dump();
|
||||
if (PrintGraph)
|
||||
llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
|
||||
auto LRTable = clang::pseudo::LRTable::buildSLR(*G);
|
||||
if (PrintTable)
|
||||
llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G);
|
||||
return 0;
|
||||
llvm::outs() << LRTable.dumpForTests(*G);
|
||||
|
||||
if (ParseableStream) {
|
||||
clang::pseudo::ForestArena Arena;
|
||||
clang::pseudo::GSS GSS;
|
||||
auto &Root =
|
||||
glrParse(*ParseableStream,
|
||||
clang::pseudo::ParseParams{*G, LRTable, Arena, GSS});
|
||||
if (PrintForest)
|
||||
llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true);
|
||||
|
||||
if (PrintStatistics) {
|
||||
llvm::outs() << "Forest bytes: " << Arena.bytes()
|
||||
<< " nodes: " << Arena.nodeCount() << "\n";
|
||||
llvm::outs() << "GSS bytes: " << GSS.bytes()
|
||||
<< " nodes: " << GSS.nodeCount() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
if (Source.getNumOccurrences()) {
|
||||
std::string Text = readOrDie(Source);
|
||||
clang::LangOptions LangOpts; // FIXME: use real options.
|
||||
auto Stream = clang::pseudo::lex(Text, LangOpts);
|
||||
auto Structure = clang::pseudo::DirectiveTree::parse(Stream);
|
||||
clang::pseudo::chooseConditionalBranches(Structure, Stream);
|
||||
|
||||
if (PrintDirectiveTree)
|
||||
llvm::outs() << Structure;
|
||||
if (PrintSource)
|
||||
Stream.print(llvm::outs());
|
||||
if (PrintTokens)
|
||||
llvm::outs() << Stream;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -6,6 +6,7 @@ add_custom_target(ClangPseudoUnitTests)
|
|||
add_unittest(ClangPseudoUnitTests ClangPseudoTests
|
||||
DirectiveTreeTest.cpp
|
||||
ForestTest.cpp
|
||||
GLRTest.cpp
|
||||
GrammarTest.cpp
|
||||
LRTableTest.cpp
|
||||
TokenTest.cpp
|
||||
|
|
|
@ -0,0 +1,393 @@
|
|||
//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/GLR.h"
|
||||
#include "clang-pseudo/Grammar.h"
|
||||
#include "clang-pseudo/Token.h"
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <memory>
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
using Action = LRTable::Action;
|
||||
using std::placeholders::_1;
|
||||
using std::placeholders::_2;
|
||||
using std::placeholders::_3;
|
||||
using testing::AllOf;
|
||||
|
||||
MATCHER_P(state, StateID, "") { return arg->State == StateID; }
|
||||
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
|
||||
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
|
||||
|
||||
|
||||
// llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const NewHeadResult &R) {
|
||||
// std::vector<std::string> ParentStates;
|
||||
// for (const auto &P : R.Parents)
|
||||
// ParentStates.push_back(llvm::formatv("{0}", P->State));
|
||||
// OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", R.State,
|
||||
// R.Parsed->symbol(), llvm::join(ParentStates, " "));
|
||||
// return OS;
|
||||
// }
|
||||
|
||||
testing::Matcher<const GSS::Node *>
|
||||
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
|
||||
return testing::Property(&GSS::Node::parents,
|
||||
testing::UnorderedElementsAreArray(Parents));
|
||||
}
|
||||
|
||||
class GLRTest : public ::testing::Test {
|
||||
public:
|
||||
void build(llvm::StringRef GrammarBNF) {
|
||||
std::vector<std::string> Diags;
|
||||
G = Grammar::parseBNF(GrammarBNF, Diags);
|
||||
}
|
||||
|
||||
void buildGrammar(std::vector<std::string> Nonterminals,
|
||||
std::vector<std::string> Rules) {
|
||||
Nonterminals.push_back("_");
|
||||
llvm::sort(Nonterminals);
|
||||
Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
|
||||
Nonterminals.end());
|
||||
std::string FakeTestBNF;
|
||||
for (const auto &NT : Nonterminals)
|
||||
FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
|
||||
FakeTestBNF += llvm::join(Rules, "\n");
|
||||
build(FakeTestBNF);
|
||||
}
|
||||
|
||||
SymbolID id(llvm::StringRef Name) const {
|
||||
for (unsigned I = 0; I < NumTerminals; ++I)
|
||||
if (G->table().Terminals[I] == Name)
|
||||
return tokenSymbol(static_cast<tok::TokenKind>(I));
|
||||
for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
|
||||
if (G->table().Nonterminals[ID].Name == Name)
|
||||
return ID;
|
||||
ADD_FAILURE() << "No such symbol found: " << Name;
|
||||
return 0;
|
||||
}
|
||||
|
||||
RuleID ruleFor(llvm::StringRef NonterminalName) const {
|
||||
auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange;
|
||||
if (RuleRange.End - RuleRange.Start == 1)
|
||||
return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start;
|
||||
ADD_FAILURE() << "Expected a single rule for " << NonterminalName
|
||||
<< ", but it has " << RuleRange.End - RuleRange.Start
|
||||
<< " rule!\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
NewHeadCallback captureNewHeads() {
|
||||
return [this](const GSS::Node *NewHead) {
|
||||
NewHeadResults.push_back(NewHead);
|
||||
};
|
||||
};
|
||||
|
||||
protected:
|
||||
std::unique_ptr<Grammar> G;
|
||||
ForestArena Arena;
|
||||
GSS GSS;
|
||||
std::vector<const GSS::Node*> NewHeadResults;
|
||||
};
|
||||
|
||||
TEST_F(GLRTest, ShiftMergingHeads) {
|
||||
// Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
|
||||
// 2 have shift actions to reach state 4, and the head 3 has a shift action to
|
||||
// reach state 5:
|
||||
// 0--1
|
||||
// └--2
|
||||
// └--3
|
||||
// After the shift action, the GSS (with new heads 4, 5) is:
|
||||
// 0---1---4
|
||||
// └---2---┘
|
||||
// └---3---5
|
||||
auto *GSSNode0 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
|
||||
auto *GSSNode1 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0});
|
||||
auto *GSSNode2 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0});
|
||||
auto *GSSNode3 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{GSSNode0});
|
||||
|
||||
buildGrammar({}, {}); // Create a fake empty grammar.
|
||||
LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{});
|
||||
|
||||
ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
|
||||
std::vector<ParseStep> PendingShift = {
|
||||
{GSSNode1, Action::shift(4)},
|
||||
{GSSNode3, Action::shift(5)},
|
||||
{GSSNode2, Action::shift(4)},
|
||||
};
|
||||
glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSS},
|
||||
captureNewHeads());
|
||||
|
||||
EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
|
||||
AllOf(state(4), parsedSymbol(&SemiTerminal),
|
||||
parents({GSSNode1, GSSNode2})),
|
||||
AllOf(state(5), parsedSymbol(&SemiTerminal),
|
||||
parents({GSSNode3}))));
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, ReduceConflictsSplitting) {
|
||||
// Before (splitting due to R/R conflict):
|
||||
// 0--1(IDENTIFIER)
|
||||
// After reducing 1 by `class-name := IDENTIFIER` and
|
||||
// `enum-name := IDENTIFIER`:
|
||||
// 0--2(class-name) // 2 is goto(0, class-name)
|
||||
// └--3(enum-name) // 3 is goto(0, enum-name)
|
||||
buildGrammar({"class-name", "enum-name"},
|
||||
{"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
|
||||
|
||||
LRTable Table = LRTable::buildForTests(
|
||||
G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)},
|
||||
{/*State=*/0, id("enum-name"), Action::goTo(3)}});
|
||||
|
||||
const auto *GSSNode0 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
|
||||
const auto *GSSNode1 =
|
||||
GSS.addNode(3, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
|
||||
|
||||
std::vector<ParseStep> PendingReduce = {
|
||||
{GSSNode1, Action::reduce(ruleFor("class-name"))},
|
||||
{GSSNode1, Action::reduce(ruleFor("enum-name"))}};
|
||||
glrReduce(PendingReduce, {*G, Table, Arena, GSS},
|
||||
captureNewHeads());
|
||||
// Verify
|
||||
EXPECT_THAT(NewHeadResults,
|
||||
testing::UnorderedElementsAre(
|
||||
AllOf(state(2), parsedSymbolID(id("class-name")),
|
||||
parents({GSSNode0})),
|
||||
AllOf(state(3), parsedSymbolID(id("enum-name")),
|
||||
parents({GSSNode0}))));
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
|
||||
// Before (splitting due to multiple bases):
|
||||
// 2(class-name)--4(*)
|
||||
// 3(enum-name)---┘
|
||||
// After reducing 4 by `ptr-operator := *`:
|
||||
// 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
|
||||
// 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
|
||||
buildGrammar({"ptr-operator", "class-name", "enum-name"},
|
||||
{"ptr-operator := *"});
|
||||
|
||||
auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
|
||||
auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
|
||||
|
||||
const auto *GSSNode2 =
|
||||
GSS.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
|
||||
const auto *GSSNode3 =
|
||||
GSS.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
|
||||
const auto *GSSNode4 = GSS.addNode(
|
||||
/*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
|
||||
/*Parents=*/{GSSNode2, GSSNode3});
|
||||
|
||||
LRTable Table = LRTable::buildForTests(
|
||||
G->table(),
|
||||
{{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)},
|
||||
{/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}});
|
||||
std::vector<ParseStep> PendingReduce = {
|
||||
{GSSNode4, Action::reduce(ruleFor("ptr-operator"))}};
|
||||
glrReduce(PendingReduce, {*G, Table, Arena, GSS},
|
||||
captureNewHeads());
|
||||
|
||||
EXPECT_THAT(NewHeadResults,
|
||||
testing::UnorderedElementsAre(
|
||||
AllOf(state(5), parsedSymbolID(id("ptr-operator")),
|
||||
parents({GSSNode2})),
|
||||
AllOf(state(6), parsedSymbolID(id("ptr-operator")),
|
||||
parents({GSSNode3}))));
|
||||
// Verify that the payload of the two new heads is shared, only a single
|
||||
// ptr-operator node is created in the forest.
|
||||
EXPECT_EQ(NewHeadResults[0]->Payload, NewHeadResults[1]->Payload);
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
|
||||
// Before (joining due to same goto state, multiple bases):
|
||||
// 0--1(cv-qualifier)--3(class-name)
|
||||
// └--2(cv-qualifier)--4(enum-name)
|
||||
// After reducing 3 by `type-name := class-name` and
|
||||
// 4 by `type-name := enum-name`:
|
||||
// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
|
||||
// └--2(cv-qualifier)--┘ // goto(2, type-name)
|
||||
buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
|
||||
{"type-name := class-name", "type-name := enum-name"});
|
||||
|
||||
auto *CVQualifierNode =
|
||||
&Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
|
||||
auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
|
||||
auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);
|
||||
|
||||
const auto *GSSNode0 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
|
||||
const auto *GSSNode1 = GSS.addNode(
|
||||
/*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode2 = GSS.addNode(
|
||||
/*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode3 = GSS.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
|
||||
/*Parents=*/{GSSNode1});
|
||||
const auto *GSSNode4 = GSS.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
|
||||
/*Parents=*/{GSSNode2});
|
||||
|
||||
LRTable Table = LRTable::buildForTests(
|
||||
G->table(),
|
||||
{{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)},
|
||||
{/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}});
|
||||
// FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
|
||||
std::vector<ParseStep> PendingReduce = {
|
||||
{
|
||||
GSSNode3, Action::reduce(/*RuleID=*/0) // type-name := class-name
|
||||
},
|
||||
{
|
||||
GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name
|
||||
}};
|
||||
glrReduce(PendingReduce, {*G, Table, Arena, GSS},
|
||||
captureNewHeads());
|
||||
|
||||
// Verify that the stack heads are joint at state 5 after reduces.
|
||||
EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(AllOf(
|
||||
state(5), parsedSymbolID(id("type-name")),
|
||||
parents({GSSNode1, GSSNode2}))));
|
||||
// Verify that we create an ambiguous ForestNode of two parses of `type-name`.
|
||||
EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G),
|
||||
"[ 1, end) type-name := <ambiguous>\n"
|
||||
"[ 1, end) ├─type-name := class-name\n"
|
||||
"[ 1, end) │ └─class-name := <opaque>\n"
|
||||
"[ 1, end) └─type-name := enum-name\n"
|
||||
"[ 1, end) └─enum-name := <opaque>\n");
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, ReduceJoiningWithSameBase) {
|
||||
// Before (joining due to same goto state, the same base):
|
||||
// 0--1(class-name)--3(*)
|
||||
// └--2(enum-name)--4(*)
|
||||
// After reducing 3 by `pointer := class-name *` and
|
||||
// 2 by `pointer := enum-name *`:
|
||||
// 0--5(pointer) // 5 is goto(0, pointer)
|
||||
buildGrammar({"pointer", "class-name", "enum-name"},
|
||||
{"pointer := class-name *", "pointer := enum-name *"});
|
||||
|
||||
auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
|
||||
auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
|
||||
auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);
|
||||
|
||||
const auto *GSSNode0 =
|
||||
GSS.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
|
||||
const auto *GSSNode1 = GSS.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
|
||||
/*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode2 = GSS.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
|
||||
/*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode3 = GSS.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
|
||||
/*Parents=*/{GSSNode1});
|
||||
const auto *GSSNode4 = GSS.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
|
||||
/*Parents=*/{GSSNode2});
|
||||
|
||||
LRTable Table = LRTable::buildForTests(
|
||||
G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}});
|
||||
// FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
|
||||
std::vector<ParseStep> PendingReduce = {
|
||||
{
|
||||
GSSNode3, Action::reduce(/*RuleID=*/0) // pointer := class-name *
|
||||
},
|
||||
{
|
||||
GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name *
|
||||
}};
|
||||
glrReduce(PendingReduce, {*G, Table, Arena, GSS},
|
||||
captureNewHeads());
|
||||
|
||||
EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
|
||||
AllOf(state(5), parsedSymbolID(id("pointer")),
|
||||
parents({GSSNode0}))));
|
||||
EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G),
|
||||
"[ 0, end) pointer := <ambiguous>\n"
|
||||
"[ 0, end) ├─pointer := class-name *\n"
|
||||
"[ 0, 1) │ ├─class-name := <opaque>\n"
|
||||
"[ 1, end) │ └─* := tok[1]\n"
|
||||
"[ 0, end) └─pointer := enum-name *\n"
|
||||
"[ 0, 1) ├─enum-name := <opaque>\n"
|
||||
"[ 1, end) └─* := tok[1]\n");
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, PerfectForestNodeSharing) {
|
||||
// Run the GLR on a simple grammar and test that we build exactly one forest
|
||||
// node per (SymbolID, token range).
|
||||
|
||||
// This is a grmammar where the original parsing-stack-based forest node
|
||||
// sharing approach will fail. In its LR0 graph, it has two states containing
|
||||
// item `expr := • IDENTIFIER`, and both have different goto states on the
|
||||
// nonterminal `expr`.
|
||||
build(R"bnf(
|
||||
_ := test
|
||||
|
||||
test := { expr
|
||||
test := { IDENTIFIER
|
||||
test := left-paren expr
|
||||
left-paren := {
|
||||
expr := IDENTIFIER
|
||||
)bnf");
|
||||
clang::LangOptions LOptions;
|
||||
const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
|
||||
auto LRTable = LRTable::buildSLR(*G);
|
||||
|
||||
const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSS});
|
||||
// Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
|
||||
// in the forest, see the `#1` and `=#1` in the dump string.
|
||||
EXPECT_EQ(Parsed.dumpRecursive(*G),
|
||||
"[ 0, end) test := <ambiguous>\n"
|
||||
"[ 0, end) ├─test := { expr\n"
|
||||
"[ 0, 1) │ ├─{ := tok[0]\n"
|
||||
"[ 1, end) │ └─expr := IDENTIFIER #1\n"
|
||||
"[ 1, end) │ └─IDENTIFIER := tok[1]\n"
|
||||
"[ 0, end) ├─test := { IDENTIFIER\n"
|
||||
"[ 0, 1) │ ├─{ := tok[0]\n"
|
||||
"[ 1, end) │ └─IDENTIFIER := tok[1]\n"
|
||||
"[ 0, end) └─test := left-paren expr\n"
|
||||
"[ 0, 1) ├─left-paren := {\n"
|
||||
"[ 0, 1) │ └─{ := tok[0]\n"
|
||||
"[ 1, end) └─expr := IDENTIFIER =#1\n"
|
||||
"[ 1, end) └─IDENTIFIER := tok[1]\n");
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, GLRReduceOrder) {
|
||||
// Given the following grammar, and the input `IDENTIFIER`, reductions should
|
||||
// be performed in the following order:
|
||||
// 1. foo := IDENTIFIER
|
||||
// 2. { test := IDENTIFIER, test := foo }
|
||||
// foo should be reduced first, so that in step 2 we have completed reduces
|
||||
// for test, and form an ambiguous forest node.
|
||||
build(R"bnf(
|
||||
_ := test
|
||||
|
||||
test := IDENTIFIER
|
||||
test := foo
|
||||
foo := IDENTIFIER
|
||||
)bnf");
|
||||
clang::LangOptions LOptions;
|
||||
const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
|
||||
auto LRTable = LRTable::buildSLR(*G);
|
||||
|
||||
const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSS});
|
||||
EXPECT_EQ(Parsed.dumpRecursive(*G),
|
||||
"[ 0, end) test := <ambiguous>\n"
|
||||
"[ 0, end) ├─test := IDENTIFIER\n"
|
||||
"[ 0, end) │ └─IDENTIFIER := tok[0]\n"
|
||||
"[ 0, end) └─test := foo\n"
|
||||
"[ 0, end) └─foo := IDENTIFIER\n"
|
||||
"[ 0, end) └─IDENTIFIER := tok[0]\n");
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
Loading…
Reference in New Issue