llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h

//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  The LRTable (referred as LR parsing table in the LR literature) is the core
//  component in LR parsers, it drives the LR parsers by specifying an action to
//  take given the current state on the top of the stack and the current
//  lookahead token.
//
//  The LRTable can be described as a matrix where the rows represent
//  the states of the LR graph, the columns represent the symbols of the
//  grammar, and each entry of the matrix (called action) represents a
//  state transition in the graph.
//
//  Typically, based on the category of the grammar symbol, the LRTable is
//  broken into two logically separate tables:
//    - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
//      next action (shift/reduce/accept/error) on state S under a lookahead
//      terminal a
//    - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
//      the state which we transist to from the state S with the nonterminal X
//
//  LRTable is *performance-critial* as it is consulted frequently during a
//  parse. In general, LRTable is very sparse (most of the entries are empty).
//  For example, for the C++ language, the SLR table has ~1500 states and 650
//  symbols which results in a matrix having 975K entries, ~90% of entries are
//  empty.
//
//  This file implements a speed-and-space-efficient LRTable.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_LRTABLE_H
#define CLANG_PSEUDO_LRTABLE_H

#include "clang-pseudo/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include <cstdint>
#include <vector>

namespace clang {
namespace pseudo {

// Represents the LR parsing table, which can efficiently the question "what is
// the next step given the lookahead token and current state on top of the
// stack?".
//
// This is a dense implementation, which only takes an amount of space that is
// proportional to the number of non-empty entries in the table.
//
// Unlike the typical LR parsing table which allows at most one available action
// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
// to be used in nondeterministic LR parsers (e.g. GLR).
class LRTable {
public:
  // StateID is only 13 bits wide.
  using StateID = uint16_t;
  static constexpr unsigned StateBits = 13;

  // Action represents the terminal and nonterminal actions, it combines the
  // entry of the ACTION and GOTO tables from the LR literature.
  class Action {
  public:
    enum Kind : uint8_t {
      Sentinel = 0,
      // Terminal actions, corresponding to entries of ACTION table.

      // Shift to state n: move forward with the lookahead, and push state n
      // onto the state stack.
      // A shift is a forward transition, and the value n is the next state that
      // the parser is to enter.
      Shift,
      // Reduce by a rule: pop the state stack.
      Reduce,
      // Signals that we have parsed the input successfully.
      Accept,

      // Nonterminal actions, corresponding to entry of GOTO table.

      // Go to state n: push state n onto the state stack.
      // Similar to Shift, but it is a nonterminal forward transition.
      GoTo,
    };

    static Action accept(RuleID RID) { return Action(Accept, RID); }
    static Action goTo(StateID S) { return Action(GoTo, S); }
    static Action shift(StateID S) { return Action(Shift, S); }
    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
    static Action sentinel() { return Action(Sentinel, 0); }

    StateID getShiftState() const {
      assert(kind() == Shift);
      return Value;
    }
    StateID getGoToState() const {
      assert(kind() == GoTo);
      return Value;
    }
    RuleID getReduceRule() const {
      assert(kind() == Reduce);
      return Value;
    }
    Kind kind() const { return static_cast<Kind>(K); }

    bool operator==(const Action &L) const { return opaque() == L.opaque(); }
    uint16_t opaque() const { return K << ValueBits | Value; };

  private:
    Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
    static constexpr unsigned ValueBits = StateBits;
    static constexpr unsigned KindBits = 3;
    static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
    static_assert(KindBits + ValueBits <= 16,
                  "Must be able to store kind and value efficiently");
    uint16_t K : KindBits;
    // Either StateID or RuleID, depending on the Kind.
    uint16_t Value : ValueBits;
  };

  // Returns all available actions for the given state on a terminal.
  // Expected to be called by LR parsers.
  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
  // Returns the state after we reduce a nonterminal.
  // Expected to be called by LR parsers.
  StateID getGoToState(StateID State, SymbolID Nonterminal) const;

  // Looks up available actions.
  // Returns empty if no available actions in the table.
  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;

  size_t bytes() const {
    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
           States.capacity() * sizeof(StateID) +
           NontermOffset.capacity() * sizeof(uint32_t) +
           TerminalOffset.capacity() * sizeof(uint32_t);
  }

  std::string dumpStatistics() const;
  std::string dumpForTests(const Grammar &G) const;

  // Build a SLR(1) parsing table.
  static LRTable buildSLR(const Grammar &G);

  class Builder;
  // Represents an entry in the table, used for building the LRTable.
  struct Entry {
    StateID State;
    SymbolID Symbol;
    Action Act;
  };
  // Build a specifid table for testing purposes.
  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);

private:
  // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
  // Our physical representation is quite different for compactness.

  // Index is nonterminal SymbolID, value is the offset into States/Actions
  // where the entries for this nonterminal begin.
  // Give a nonterminal id, the corresponding half-open range of StateIdx is
  // [NontermIdx[id], NontermIdx[id+1]).
  std::vector<uint32_t> NontermOffset;
  // Similar to NontermOffset, but for terminals, index is tok::TokenKind.
  std::vector<uint32_t> TerminalOffset;
  // Parallel to Actions, the value is State (rows of the matrix).
  // Grouped by the SymbolID, and only subranges are sorted.
  std::vector<StateID> States;
  // A flat list of available actions, sorted by (SymbolID, State).
  std::vector<Action> Actions;
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_LRTABLE_H