forked from OSchip/llvm-project
173 lines
6.0 KiB
Python
173 lines
6.0 KiB
Python
|
#!/usr/bin/env python
|
||
|
#===- cppreference_parser.py - ------------------------------*- python -*--===#
|
||
|
#
|
||
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
# See https://llvm.org/LICENSE.txt for license information.
|
||
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
#
|
||
|
#===------------------------------------------------------------------------===#
|
||
|
|
||
|
from bs4 import BeautifulSoup, NavigableString
|
||
|
|
||
|
import collections
|
||
|
import multiprocessing
|
||
|
import os
|
||
|
import re
|
||
|
import signal
|
||
|
import sys
|
||
|
|
||
|
|
||
|
class Symbol:
    """A symbol together with the headers associated with it.

    Attributes:
      name: unqualified symbol name, e.g. "move".
      namespace: namespace of the symbol (with trailing "::"), e.g. "std::",
        or "" for the global scope. None for C symbols.
      headers: a list of corresponding headers.
    """

    def __init__(self, name, namespace, headers):
        self.name = name
        self.namespace = namespace
        self.headers = headers
|
||
|
|
||
|
|
||
|
def _HasClass(tag, *classes):
|
||
|
for c in tag.get('class', []):
|
||
|
if c in classes:
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
|
||
|
def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse symbol page and retrieve the include header defined in this page.

    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: symbol name to look for in the declaration rows.

    Returns a set of header strings (the text of the <code> elements): the
    headers preceding the declarations that name the symbol, or, if no
    declaration names it, every header mentioned in the page's tables.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are in the first cell. A row without any cell cannot
                # name the symbol, so skip it instead of failing on None.
                first_cell = row.find('td')
                if first_cell is None:
                    continue
                if symbol_name in first_cell.stripped_strings:
                    headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers
|
||
|
|
||
|
|
||
|
def _ParseIndexPage(index_page_html):
    """Parse index page.

    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    """
    soup = BeautifulSoup(index_page_html, "html.parser")
    entries = []
    for link in soup.select("a[title]"):
        # Flag annotated symbols like "acos<>() (std::complex)" as variants.
        # These tend to be overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug
        # is unfixed.
        caption = link.next_sibling
        variant = isinstance(caption, NavigableString) and "(" in caption
        name_tag = link.find("tt")
        if not name_tag:
            # Links without a <tt> child carry no symbol name.
            continue
        # Strip any trailing <>() from the displayed name.
        name = name_tag.text.rstrip("<>()")
        entries.append((name, link["href"], variant))
    return entries
|
||
|
|
||
|
|
||
|
def _ReadSymbolPage(path, name):
    """Read the symbol page at `path` and parse out the headers for `name`.

    Args:
      path: filesystem path of the symbol's HTML page.
      name: symbol name, forwarded to _ParseSymbolPage.

    Returns whatever _ParseSymbolPage returns for the page contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding (the pages are assumed to be UTF-8 encoded HTML).
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name)
|
||
|
|
||
|
|
||
|
def _GetSymbols(pool, root_dir, index_page_name, namespace):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: worker pool providing apply_async, used to read symbol pages in
        parallel.
      root_dir: directory containing the index page; symbol page hrefs are
        resolved relative to it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace recorded on every returned Symbol.

    Returns a list of Symbols sorted by name, each with a sorted list of headers.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    # Read the index up front so the file is not held open while waiting for
    # the workers; decode as UTF-8 independent of the locale (assumed encoding
    # of the pages).
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_entries = _ParseIndexPage(f.read())

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in index_entries:
        # Variant symbols (e.g. the std::locale version of isalpha) add
        # ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        if variant:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        # A missing page should not abort the whole run: report and skip it.
        if not os.path.isfile(path):
            sys.stderr.write(
                "Discarding information for symbol: %s. Page %s does not "
                "exist.\n" % (symbol_name, path))
            continue
        results.append((symbol_name,
                        pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort both the symbols and each symbol's headers so that the output is
    # deterministic (set iteration order is not).
    return [Symbol(name, namespace, sorted(symbol_headers[name]))
            for name in sorted(symbol_headers)]
|
||
|
|
||
|
|
||
|
def _IgnoreSigint():
    """Pool worker initializer: make the worker process ignore SIGINT."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list with the symbols of every page, in the order the pages
    were given.
    """
    symbols = []
    # Run many workers to process individual symbol pages under the symbol
    # index. Don't allow workers to capture Ctrl-C.
    # The initializer is a module-level function rather than a lambda so that
    # it can be pickled when the start method is "spawn" or "forkserver".
    pool = multiprocessing.Pool(initializer=_IgnoreSigint)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
    finally:
        # Always tear the workers down, even if parsing a page raised.
        pool.terminate()
        pool.join()
    return symbols
|