forked from OSchip/llvm-project
181 lines
6.9 KiB
C++
181 lines
6.9 KiB
C++
//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// The LRTable (referred as LR parsing table in the LR literature) is the core
|
|
// component in LR parsers, it drives the LR parsers by specifying an action to
|
|
// take given the current state on the top of the stack and the current
|
|
// lookahead token.
|
|
//
|
|
// The LRTable can be described as a matrix where the rows represent
|
|
// the states of the LR graph, the columns represent the symbols of the
|
|
// grammar, and each entry of the matrix (called action) represents a
|
|
// state transition in the graph.
|
|
//
|
|
// Typically, based on the category of the grammar symbol, the LRTable is
|
|
// broken into two logically separate tables:
|
|
// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
|
|
// next action (shift/reduce/accept/error) on state S under a lookahead
|
|
// terminal a
|
|
// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
|
|
// the state which we transit to from the state S with the nonterminal X
|
|
//
|
|
// LRTable is *performance-critical* as it is consulted frequently during a
|
|
// parse. In general, LRTable is very sparse (most of the entries are empty).
|
|
// For example, for the C++ language, the SLR table has ~1500 states and 650
|
|
// symbols which results in a matrix having 975K entries, ~90% of entries are
|
|
// empty.
|
|
//
|
|
// This file implements a speed-and-space-efficient LRTable.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef CLANG_PSEUDO_LRTABLE_H
|
|
#define CLANG_PSEUDO_LRTABLE_H
|
|
|
|
#include "clang-pseudo/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
|
|
|
|
namespace clang {
|
|
namespace pseudo {
|
|
|
|
// Represents the LR parsing table, which can efficiently answer the question
// "what is the next step given the lookahead token and current state on top of
// the stack?".
|
|
//
|
|
// This is a dense implementation, which only takes an amount of space that is
|
|
// proportional to the number of non-empty entries in the table.
|
|
//
|
|
// Unlike the typical LR parsing table which allows at most one available action
|
|
// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
|
|
// to be used in nondeterministic LR parsers (e.g. GLR).
|
|
class LRTable {
|
|
public:
|
|
// StateID is only 13 bits wide.
|
|
using StateID = uint16_t;
|
|
static constexpr unsigned StateBits = 13;
|
|
|
|
// Action represents the terminal and nonterminal actions, it combines the
|
|
// entry of the ACTION and GOTO tables from the LR literature.
|
|
class Action {
|
|
public:
|
|
enum Kind : uint8_t {
|
|
Sentinel = 0,
|
|
// Terminal actions, corresponding to entries of ACTION table.
|
|
|
|
// Shift to state n: move forward with the lookahead, and push state n
|
|
// onto the state stack.
|
|
// A shift is a forward transition, and the value n is the next state that
|
|
// the parser is to enter.
|
|
Shift,
|
|
// Reduce by a rule: pop the state stack.
|
|
Reduce,
|
|
// Signals that we have parsed the input successfully.
|
|
Accept,
|
|
|
|
// Nonterminal actions, corresponding to entry of GOTO table.
|
|
|
|
// Go to state n: push state n onto the state stack.
|
|
// Similar to Shift, but it is a nonterminal forward transition.
|
|
GoTo,
|
|
};
|
|
|
|
static Action accept(RuleID RID) { return Action(Accept, RID); }
|
|
static Action goTo(StateID S) { return Action(GoTo, S); }
|
|
static Action shift(StateID S) { return Action(Shift, S); }
|
|
static Action reduce(RuleID RID) { return Action(Reduce, RID); }
|
|
static Action sentinel() { return Action(Sentinel, 0); }
|
|
|
|
StateID getShiftState() const {
|
|
assert(kind() == Shift);
|
|
return Value;
|
|
}
|
|
StateID getGoToState() const {
|
|
assert(kind() == GoTo);
|
|
return Value;
|
|
}
|
|
RuleID getReduceRule() const {
|
|
assert(kind() == Reduce);
|
|
return Value;
|
|
}
|
|
Kind kind() const { return static_cast<Kind>(K); }
|
|
|
|
bool operator==(const Action &L) const { return opaque() == L.opaque(); }
|
|
uint16_t opaque() const { return K << ValueBits | Value; };
|
|
|
|
private:
|
|
Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
|
|
static constexpr unsigned ValueBits = StateBits;
|
|
static constexpr unsigned KindBits = 3;
|
|
static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
|
|
static_assert(KindBits + ValueBits <= 16,
|
|
"Must be able to store kind and value efficiently");
|
|
uint16_t K : KindBits;
|
|
// Either StateID or RuleID, depending on the Kind.
|
|
uint16_t Value : ValueBits;
|
|
};
|
|
|
|
// Returns all available actions for the given state on a terminal.
|
|
// Expected to be called by LR parsers.
|
|
llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
|
|
// Returns the state after we reduce a nonterminal.
|
|
// Expected to be called by LR parsers.
|
|
StateID getGoToState(StateID State, SymbolID Nonterminal) const;
|
|
|
|
// Looks up available actions.
|
|
// Returns empty if no available actions in the table.
|
|
llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
|
|
|
|
size_t bytes() const {
|
|
return sizeof(*this) + Actions.capacity() * sizeof(Action) +
|
|
States.capacity() * sizeof(StateID) +
|
|
NontermOffset.capacity() * sizeof(uint32_t) +
|
|
TerminalOffset.capacity() * sizeof(uint32_t);
|
|
}
|
|
|
|
std::string dumpStatistics() const;
|
|
std::string dumpForTests(const Grammar &G) const;
|
|
|
|
// Build a SLR(1) parsing table.
|
|
static LRTable buildSLR(const Grammar &G);
|
|
|
|
class Builder;
|
|
// Represents an entry in the table, used for building the LRTable.
|
|
struct Entry {
|
|
StateID State;
|
|
SymbolID Symbol;
|
|
Action Act;
|
|
};
|
|
// Build a specifid table for testing purposes.
|
|
static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
|
|
|
|
private:
|
|
// Conceptually the LR table is a multimap from (State, SymbolID) => Action.
|
|
// Our physical representation is quite different for compactness.
|
|
|
|
// Index is nonterminal SymbolID, value is the offset into States/Actions
|
|
// where the entries for this nonterminal begin.
|
|
// Give a nonterminal id, the corresponding half-open range of StateIdx is
|
|
// [NontermIdx[id], NontermIdx[id+1]).
|
|
std::vector<uint32_t> NontermOffset;
|
|
// Similar to NontermOffset, but for terminals, index is tok::TokenKind.
|
|
std::vector<uint32_t> TerminalOffset;
|
|
// Parallel to Actions, the value is State (rows of the matrix).
|
|
// Grouped by the SymbolID, and only subranges are sorted.
|
|
std::vector<StateID> States;
|
|
// A flat list of available actions, sorted by (SymbolID, State).
|
|
std::vector<Action> Actions;
|
|
};
|
|
// Stream-insertion operator for an LRTable::Action.
// NOTE(review): only declared here; the definition (and the exact output
// format) lives outside this header -- confirm against the implementation.
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
|
|
|
|
} // namespace pseudo
|
|
} // namespace clang
|
|
|
|
#endif // CLANG_PSEUDO_LRTABLE_H
|