[clangd] Using symbol name to map includes for STL symbols.

Summary:
Suffix path mapping relies on the file layout of a particular STL
implementation, so it is not portable. This patch uses symbol name mapping
instead, which should work across STL implementations. Fixes clangd/clangd#9.

To generate the symbol mapping, we parse the cppreference symbol index
page to build a lookup table.

The mapping is not complete yet; a few TODOs remain:
  - support symbols from different headers (e.g. std::move)
  - support STL macros
  - support symbols from std's sub-namespaces (e.g. chrono)

Reviewers: ioeric, jfb, serge-sans-paille

Reviewed By: ioeric

Subscribers: sammccall, klimek, ilya-biryukov, ioeric, MaskRay, jkorous, mgrang, arphaman, kadircet, jfb, jdoerfert, cfe-commits

Tags: #clang-tools-extra, #clang

Differential Revision: https://reviews.llvm.org/D58345

llvm-svn: 356134
This commit is contained in:
Haojian Wu 2019-03-14 11:25:26 +00:00
parent e81f5f91b4
commit 7f51b5dc32
4 changed files with 1484 additions and 35 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,149 @@
#!/usr/bin/env python
#===- gen_std.py - ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C++ Standard Library symbols by parsing archived HTML
files from cppreference.
Caveats and FIXMEs:
- only symbols directly in "std" namespace are added, we should also add std's
subnamespace symbols (e.g. chrono).
- symbols with multiple variants or defined in multiple headers aren't added,
e.g. std::move, std::swap
Usage:
1. Install BeautifulSoup dependency, see instruction:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
2. Download cppreference offline HTML files (e.g. html_book_20181028.zip) at
https://en.cppreference.com/w/Cppreference:Archives
3. Unzip the zip file from step 2 to directory </cppreference>, you should
get a "reference" directory in </cppreference>
4. Run the command:
gen_std.py -cppreference </cppreference/reference> > StdSymbolMap.inc
"""
from bs4 import BeautifulSoup
import argparse
import datetime
import os
import sys
# Banner written at the top of the generated output. The single "%s"
# placeholder is filled in by main() with the modification date of the
# cppreference symbol_index.html page (formatted as YYYY-MM-DD).
STDGEN_CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for C++
// Standard Library symbols.
//
// Automatically generated file, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
def ParseSymbolPage(symbol_page_html):
    """Extract the headers a cppreference symbol page says the symbol is in.

    A symbol page names its header(s) in a "Defined in header <header>" row,
    for example:

      <tr class="t-dsc-header">
        <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
      </td></tr>

    Args:
      symbol_page_html: HTML text of a single symbol page.

    Returns:
      A list with the text of every <code> element found in such rows
      (e.g. ['<ratio>']), in document order. Empty if there is none.
    """
    document = BeautifulSoup(symbol_page_html, "html.parser")
    found = []
    # The "Defined in header " rows are either <tr class="t-dsc-header"> or
    # <tr class="t-dcl-header">.
    for row in document.select('tr.t-dcl-header,tr.t-dsc-header'):
        if "Defined in header " not in row.text:
            continue
        # The header name itself (e.g. <cstdlib>) is wrapped in <code>; one row
        # may carry several of them.
        found.extend(code_tag.text for code_tag in row.find_all("code"))
    return found
def ParseIndexPage(index_page_html):
    """Collect (symbol name, symbol page href) pairs from the symbol index.

    The index page lists the std symbols, each linking to its detail page
    (which in turn names the defining header), for example:

      <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
      <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Args:
      index_page_html: HTML text of the symbol index page.

    Returns:
      A list of (symbol_name, relative_path_to_symbol_page) tuples in document
      order. The same name may appear more than once with different pages.
    """
    document = BeautifulSoup(index_page_html, "html.parser")
    entries = []
    for anchor in document.select("a[title]"):
        name_tag = anchor.find("tt")
        # Only anchors that wrap the symbol name in <tt> are symbol entries.
        if not name_tag:
            continue
        # Drop any trailing "<>" / "()" decoration, e.g. "abs<>()" -> "abs".
        unqualified_name = name_tag.text.rstrip("<>()")
        entries.append((unqualified_name, anchor["href"]))
    return entries
def ParseArg():
    """Parse the command line of the generator.

    The only option is the mandatory "-cppreference PATH", pointing at the
    unpacked cppreference offline HTML directory.

    Returns:
      The argparse namespace; the path is available as its `cppreference`
      attribute.
    """
    arg_parser = argparse.ArgumentParser(description='Generate StdGen file')
    cppreference_option = dict(
        metavar='PATH',
        default='',
        help='path to the cppreference offline HTML directory',
        required=True,
    )
    arg_parser.add_argument('-cppreference', **cppreference_option)
    return arg_parser.parse_args()
def main():
    """Generate the symbol -> header table and print it to stdout.

    Reads <cppreference>/en/cpp/symbol_index.html, follows every listed symbol
    to its own page to collect the "Defined in header" entries, and prints the
    file banner followed by one SYMBOL(name, namespace, header) line per symbol
    that maps to exactly one header.

    Symbols for which no header is found are reported on stderr and skipped;
    symbols with more than one header are skipped silently for now.
    Exits with an error message if the index page does not exist.
    """
    args = ParseArg()
    cpp_reference_root = args.cppreference
    cpp_symbol_root = os.path.join(cpp_reference_root, "en", "cpp")
    index_page_path = os.path.join(cpp_symbol_root, "symbol_index.html")
    if not os.path.exists(index_page_path):
        # sys.exit() with a string writes it to stderr and exits with status 1;
        # unlike the bare exit() helper it does not depend on the site module.
        sys.exit("Path %s doesn't exist!" % index_page_path)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of the symbol_index.html as the version.
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    with open(index_page_path, "r") as index_file:
        index_entries = ParseIndexPage(index_file.read())

    # A map from symbol name to a set of headers.
    symbols = {}
    for symbol_name, symbol_page_path in index_entries:
        page_path = os.path.join(cpp_symbol_root, symbol_page_path)
        with open(page_path, "r") as page_file:
            headers = ParseSymbolPage(page_file.read())
        if not headers:
            sys.stderr.write("No header found for symbol %s at %s\n" % (
                symbol_name, symbol_page_path))
            continue
        # The same name can be listed several times (overloads on different
        # pages); merge all headers seen for it.
        symbols.setdefault(symbol_name, set()).update(headers)

    # Emit results to stdout. print(...) with a single parenthesised argument
    # behaves the same under Python 2 and Python 3.
    print(STDGEN_CODE_PREFIX % cppreference_modified_date)
    for name in sorted(symbols):
        headers = symbols[name]
        if len(headers) > 1:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            continue
        # SYMBOL(unqualified_name, namespace, header)
        print("SYMBOL(%s, %s, %s)" % (name, "std::", next(iter(headers))))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,101 @@
#!/usr/bin/env python
#===- test.py - ---------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
from gen_std import ParseSymbolPage, ParseIndexPage
import unittest
class TestStdGen(unittest.TestCase):
    """Unit tests for the cppreference HTML parsers in gen_std."""

    def testParseIndexPage(self):
        # Index entries: plain functions, a template overload ("<>()" suffix)
        # and entries followed by "since C++NN" markers.
        html = """
<a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
<a href="complex/abs.html" title="abs"><tt>abs&lt;&gt;()</tt></a> (std::complex) <br>
<a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
<a href="acosh.html" title="acosh"><tt>acosh()</tt></a> <span class="t-mark-rev">(since C++11)</span> <br>
<a href="as_bytes.html" title="as bytes"><tt>as_bytes&lt;&gt;()</tt></a> <span class="t-mark-rev t-since-cxx20">(since C++20)</span> <br>
"""
        expected = [
            ("abs", "abs.html"),
            ("abs", "complex/abs.html"),
            ("acos", "acos.html"),
            ("acosh", "acosh.html"),
            ("as_bytes", "as_bytes.html"),
        ]
        actual = ParseIndexPage(html)
        self.assertEqual(len(actual), len(expected))
        # Names must match exactly; paths only need to end with the expected
        # relative path.
        for (want_name, want_path), (got_name, got_path) in zip(expected, actual):
            self.assertEqual(want_name, got_name)
            self.assertTrue(got_path.endswith(want_path))

    def testParseSymbolPage_SingleHeader(self):
        # One row: Defined in header <cmath>
        html = """
<table class="t-dcl-begin"><tbody>
<tr class="t-dsc-header">
<td> <div>Defined in header <code><a href="cmath.html" title="cmath">&lt;cmath&gt;</a></code>
</div></td>
<td></td>
<td></td>
</tr>
</tbody></table>
"""
        parsed = ParseSymbolPage(html)
        self.assertEqual(parsed, ['<cmath>'])

    def testParseSymbolPage_MulHeaders(self):
        # Three separate rows:
        #   Defined in header <cstddef>
        #   Defined in header <cstdio>
        #   Defined in header <cstdlib>
        html = """
<table class="t-dcl-begin"><tbody>
<tr class="t-dsc-header">
<td> <div>Defined in header <code><a href="cstddef.html" title="cstddef">&lt;cstddef&gt;</a></code>
</div></td>
<td></td>
<td></td>
</tr>
<tr class="t-dsc-header">
<td> <div>Defined in header <code><a href="cstdio.html" title="cstdio">&lt;cstdio&gt;</a></code>
</div></td>
<td></td>
<td></td>
</tr>
<tr class="t-dsc-header">
<td> <div>Defined in header <code><a href=".cstdlib.html" title="ccstdlib">&lt;cstdlib&gt;</a></code>
</div></td>
<td></td>
<td></td>
</tr>
</tbody></table>
"""
        parsed = ParseSymbolPage(html)
        self.assertEqual(parsed, ['<cstddef>', '<cstdio>', '<cstdlib>'])

    def testParseSymbolPage_MulHeadersInSameDiv(self):
        # Multiple <code> blocks inside a single div:
        #   Defined in header <algorithm>
        #   Defined in header <utility>
        html = """
<tr class="t-dsc-header">
<td><div>
Defined in header <code><a href="../header/algorithm.html" title="cpp/header/algorithm">&lt;algorithm&gt;</a></code><br>
Defined in header <code><a href="../header/utility.html" title="cpp/header/utility">&lt;utility&gt;</a></code>
</div></td>
<td></td>
</tr>
"""
        parsed = ParseSymbolPage(html)
        self.assertEqual(parsed, ['<algorithm>', '<utility>'])


if __name__ == '__main__':
    unittest.main()

View File

@ -107,57 +107,30 @@ collectIWYUHeaderMaps(CanonicalIncludes *Includes) {
void addSystemHeadersMapping(CanonicalIncludes *Includes) {
static const std::vector<std::pair<const char *, const char *>> SymbolMap = {
{"std::addressof", "<memory>"},
// Map symbols in <iosfwd> to their preferred includes.
{"std::basic_filebuf", "<fstream>"},
{"std::basic_fstream", "<fstream>"},
{"std::basic_ifstream", "<fstream>"},
{"std::basic_ofstream", "<fstream>"},
{"std::filebuf", "<fstream>"},
{"std::fstream", "<fstream>"},
{"std::ifstream", "<fstream>"},
{"std::ofstream", "<fstream>"},
{"std::wfilebuf", "<fstream>"},
{"std::wfstream", "<fstream>"},
{"std::wifstream", "<fstream>"},
{"std::wofstream", "<fstream>"},
{"std::basic_ios", "<ios>"},
{"std::ios", "<ios>"},
{"std::wios", "<ios>"},
{"std::basic_iostream", "<iostream>"},
{"std::iostream", "<iostream>"},
{"std::wiostream", "<iostream>"},
{"std::basic_istream", "<istream>"},
{"std::istream", "<istream>"},
{"std::wistream", "<istream>"},
{"std::istreambuf_iterator", "<iterator>"},
{"std::ostreambuf_iterator", "<iterator>"},
{"std::basic_ostream", "<ostream>"},
{"std::ostream", "<ostream>"},
{"std::wostream", "<ostream>"},
{"std::basic_istringstream", "<sstream>"},
{"std::basic_ostringstream", "<sstream>"},
{"std::basic_stringbuf", "<sstream>"},
{"std::basic_stringstream", "<sstream>"},
{"std::istringstream", "<sstream>"},
{"std::ostringstream", "<sstream>"},
{"std::string", "<string>"},
{"std::stringbuf", "<sstream>"},
{"std::stringstream", "<sstream>"},
{"std::wistringstream", "<sstream>"},
{"std::wostringstream", "<sstream>"},
{"std::wstringbuf", "<sstream>"},
{"std::wstringstream", "<sstream>"},
{"std::basic_streambuf", "<streambuf>"},
{"std::streambuf", "<streambuf>"},
{"std::wstreambuf", "<streambuf>"},
{"std::uint_least16_t", "<cstdint>"}, // <type_traits> redeclares these
{"std::uint_least32_t", "<cstdint>"},
{"std::declval", "<utility>"},
#define SYMBOL(Name, NameSpace, Header) { #NameSpace#Name, #Header },
#include "StdSymbolMap.inc"
#undef SYMBOL
};
for (const auto &Pair : SymbolMap)
Includes->addSymbolMapping(Pair.first, Pair.second);
// FIXME: remove the std header mapping once we support ambiguous symbols, now
// it serves as a fallback to disambiguate:
// - symbols with multiple headers (e.g. std::move)
// - symbols with a primary template in one header and a specialization in
// another (std::abs)
static const std::vector<std::pair<const char *, const char *>>
SystemHeaderMap = {
{"include/__stddef_max_align_t.h", "<cstddef>"},