llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h

176 lines
5.9 KiB
C++

//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// LR parsers are bottom-up parsers -- they scan the input from left to right,
// and collect the right-hand side of a production rule (called handle) on top
// of the stack, then replace (reduce) the handle with the nonterminal defined
// by the production rule.
//
// This file defines LRGraph, a deterministic handle-finding finite-state
// automaton, which is a key component in LR parsers to recognize any of the
// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
// Table) based on the LRGraph.
//
// LRGraph can be constructed for any context-free grammar.
// Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but
// interpretation of the FSA is nondeterministic -- we might be in a state where
// we can either continue searching for a handle or identify a handle (called
// a shift/reduce conflict), or where we can identify more than one handle
// (called a reduce/reduce conflict).
//
// LRGraph is a common model for all variants of LR automatons, from the most
// basic one, LR(0), to the more powerful SLR(1) and LR(1), which use a
// one-token lookahead in making decisions.
//===----------------------------------------------------------------------===//
#ifndef CLANG_PSEUDO_LRGRAPH_H
#define CLANG_PSEUDO_LRGRAPH_H
#include "clang-pseudo/Grammar.h"
#include "llvm/ADT/Hashing.h"
#include <vector>
namespace clang {
namespace pseudo {
// An LR item is a grammar rule with a dot placed somewhere in its body.
// For instance, the production rule A := X Y gives rise to three items:
//   A := . X Y
//   A := X . Y
//   A := X Y .
// The dot records how much of the rule has been recognized so far: the item
// A := X . Y means the X part has been recognized from the input, and input
// derivable from Y is expected next.
class Item {
public:
  // Returns the item for rule ID with the dot in front of the first symbol.
  // The length of the rule body is looked up in G.
  static Item start(RuleID ID, const Grammar &G) {
    Item Result = sentinel(ID);
    Result.RuleLength = G.lookupRule(ID).Size;
    return Result;
  }

  // Returns an item that carries only a rule ID; its dot position and rule
  // length are both zero. The DenseMapInfo specialization in this file uses
  // this to build its empty and tombstone keys.
  static Item sentinel(RuleID ID) {
    Item Result;
    Result.RID = ID;
    return Result;
  }

  // The rule this item belongs to.
  RuleID rule() const { return RID; }

  // The dot position, i.e. how many symbols of the body precede the dot.
  uint8_t dot() const { return DotPos; }

  // True while the dot has not reached the end of the rule body.
  bool hasNext() const { return RuleLength > DotPos; }

  // Returns the symbol immediately after the dot. Requires hasNext().
  SymbolID next(const Grammar &G) const {
    assert(hasNext());
    const auto &R = G.lookupRule(RID);
    return R.Sequence[DotPos];
  }

  // Returns a copy of this item with the dot moved one symbol to the right.
  // Requires hasNext().
  Item advance() const {
    assert(hasNext());
    Item Advanced(*this);
    ++Advanced.DotPos;
    return Advanced;
  }

  std::string dump(const Grammar &G) const;

  // Two items are equal when they name the same rule and dot position;
  // RuleLength does not take part in the comparison.
  bool operator==(const Item &I) const {
    return RID == I.RID && DotPos == I.DotPos;
  }

  // Orders items by rule ID first, then by dot position.
  bool operator<(const Item &I) const {
    if (RID != I.RID)
      return RID < I.RID;
    return DotPos < I.DotPos;
  }

  // Hashes exactly the fields that operator== compares.
  friend llvm::hash_code hash_value(const Item &I) {
    return llvm::hash_combine(I.rule(), I.dot());
  }

private:
  RuleID RID = 0;
  uint8_t DotPos = 0;
  uint8_t RuleLength = 0; // the length of rule body.
};
// A State is one node of the LR automaton graph. It is an item set: the
// collection of all rules the LR parser might be in the middle of parsing
// while it is in that state.
//
// Conceptually, if we knew ahead of time what we were parsing, then at every
// point we would be partway through a single production, sitting on a stack of
// partially parsed productions. Since we do not know, there may be *several*
// productions we are partway through. That set of possibilities is the parser
// state, and all transitions between such states are precomputed.
struct State {
  // Every item of the state (non-kernel items included), stored in a
  // canonical order (see SortByNextSymbol in the cpp file).
  std::vector<Item> Items;

  // Produces a string describing the state (defined out of line).
  std::string dump(const Grammar &G, unsigned Indent = 0) const;
};
// LRGraph is a deterministic finite state automaton used for LR parsing.
//
// Intuitively, an LR automaton is a transition graph whose nodes are called
// States. Every state corresponds to one particular item set, which describes
// a condition that can arise while a production is being parsed. Edges are
// directed from one state to another, and each edge is labeled with a grammar
// symbol (terminal or nonterminal).
//
// The LR parsing table, the core data structure that drives the LR parser, is
// constructed from an LRGraph.
class LRGraph {
public:
  // A StateID is an index into the States table.
  using StateID = uint16_t;

  // Builds the LR(0) automaton.
  static LRGraph buildLR0(const Grammar &);

  // One edge of the LR graph, i.e. one transition of the LR automaton:
  // a parser in state Src that sees the lookahead Label moves to state Dst.
  struct Edge {
    StateID Src;
    StateID Dst;
    SymbolID Label;
  };

  // Read-only view of all states; a StateID indexes into it.
  llvm::ArrayRef<State> states() const { return llvm::ArrayRef<State>(States); }

  // Read-only view of all edges.
  llvm::ArrayRef<Edge> edges() const { return llvm::ArrayRef<Edge>(Edges); }

  // Produces a string dump of the graph intended for tests (defined out of
  // line).
  std::string dumpForTests(const Grammar &) const;

private:
  // Takes ownership of the given state and edge tables.
  LRGraph(std::vector<State> StateTable, std::vector<Edge> EdgeTable)
      : States(std::move(StateTable)), Edges(std::move(EdgeTable)) {}

  std::vector<State> States;
  std::vector<Edge> Edges;
};
} // namespace pseudo
} // namespace clang
namespace llvm {
// Allows clang::pseudo::Item to be used as a DenseMap key.
template <> struct DenseMapInfo<clang::pseudo::Item> {
  // The empty key is the sentinel item built from rule id -1.
  static clang::pseudo::Item getEmptyKey() {
    constexpr int EmptyRule = -1;
    return clang::pseudo::Item::sentinel(EmptyRule);
  }

  // The tombstone key is the sentinel item built from rule id -2.
  static clang::pseudo::Item getTombstoneKey() {
    constexpr int TombstoneRule = -2;
    return clang::pseudo::Item::sentinel(TombstoneRule);
  }

  // Delegates to the hash_value overload found for Item.
  static unsigned getHashValue(const clang::pseudo::Item &I) {
    return static_cast<unsigned>(hash_value(I));
  }

  // Key equality is Item::operator==.
  static bool isEqual(const clang::pseudo::Item &LHS,
                      const clang::pseudo::Item &RHS) {
    const bool Same = (LHS == RHS);
    return Same;
  }
};
} // namespace llvm
#endif // CLANG_PSEUDO_LRGRAPH_H