[include-cleaner] Include-cleaner library structure, and simplistic AST walking.

Include-cleaner is a library that uses the clang AST and preprocessor to
determine which headers are used. It will be used in clang-tidy, in
clangd, in a standalone tool at least for testing, and in out-of-tree tools.

Roughly, it walks the AST, finds referenced decls, maps these to
used sourcelocations, then to FileEntrys, then matching these against #includes.
However there are many wrinkles: dealing with macros, standard library
symbols, umbrella headers, IWYU directives etc.

It is not built on the C++20 modules concept of usage, to allow:
 - use with existing non-modules codebases
 - a flexible API embeddable in clang-tidy, clangd, and other tools
 - avoiding a chicken-and-egg problem where include cleanups are needed
   before modules can be adopted

This library is based on existing functionality in clangd that provides
an unused-include warning. However it has design changes:
 - it accommodates diagnosing missing includes too (this means tracking
   where references come from, not just the set of targets)
 - it more clearly separates the different mappings
   (symbol => location => header => include) for better testing
 - it handles special cases like standard library symbols and IWYU directives
   more elegantly by adding unified Location and Header types instead of
   side-tables
 - it will support some customization of policy where necessary (e.g.
   for style questions of what constitutes a use, or to allow
   both missing-include and unused-include modes to be conservative)

This patch adds the basic directory structure under clang-tools-extra
and a skeleton version of the AST traversal, which will be the central
piece.
A more end-to-end prototype is in https://reviews.llvm.org/D122677

RFC: https://discourse.llvm.org/t/rfc-lifting-include-cleaner-missing-unused-include-detection-out-of-clangd/61228

Differential Revision: https://reviews.llvm.org/D124164
This commit is contained in:
Sam McCall 2022-04-13 21:13:34 +02:00
parent 14869bd2df
commit 41ac245c10
15 changed files with 336 additions and 0 deletions

View File

@ -14,6 +14,7 @@ add_subdirectory(clang-doc)
add_subdirectory(clang-include-fixer)
add_subdirectory(clang-move)
add_subdirectory(clang-query)
add_subdirectory(include-cleaner)
add_subdirectory(pp-trace)
add_subdirectory(pseudo)
add_subdirectory(tool-template)

View File

@ -0,0 +1,5 @@
add_subdirectory(lib)
if(CLANG_INCLUDE_TESTS)
add_subdirectory(test)
add_subdirectory(unittests)
endif()

View File

@ -0,0 +1,47 @@
//===--- AnalysisInternal.h - Analysis building blocks ------------- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides smaller, testable pieces of the used-header analysis.
// We find the headers by chaining together several mappings.
//
// AST => AST node => Symbol => Location => Header
// /
// Macro expansion =>
//
// The individual steps are declared here.
// (AST => AST Node => Symbol is one API to avoid materializing DynTypedNodes).
//
//===----------------------------------------------------------------------===//
#ifndef CLANG_INCLUDE_CLEANER_ANALYSISINTERNAL_H
#define CLANG_INCLUDE_CLEANER_ANALYSISINTERNAL_H
#include "clang/Basic/SourceLocation.h"
#include "llvm/ADT/STLFunctionalExtras.h"
namespace clang {
class Decl;
class NamedDecl;
namespace include_cleaner {
/// Traverses part of the AST from \p Root, finding uses of symbols.
///
/// Each use is reported to the callback:
/// - the SourceLocation describes where the symbol was used. This is usually
/// the primary location of the AST node found under Root.
/// - the NamedDecl is the symbol referenced. It is canonical, rather than e.g.
/// the redecl actually found by lookup.
///
/// walkAST is typically called once per top-level declaration in the file
/// being analyzed, in order to find all references within it.
void walkAST(Decl &Root, llvm::function_ref<void(SourceLocation, NamedDecl &)>);
} // namespace include_cleaner
} // namespace clang
#endif

View File

@ -0,0 +1,10 @@
set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangIncludeCleaner
WalkAST.cpp
LINK_LIBS
clangBasic
clangAST
)

View File

@ -0,0 +1,47 @@
//===--- WalkAST.cpp - Find declaration references in the AST -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AnalysisInternal.h"
#include "clang/AST/RecursiveASTVisitor.h"
namespace clang {
namespace include_cleaner {
namespace {
using DeclCallback = llvm::function_ref<void(SourceLocation, NamedDecl &)>;
class ASTWalker : public RecursiveASTVisitor<ASTWalker> {
DeclCallback Callback;
void report(SourceLocation Loc, NamedDecl *ND) {
if (!ND || Loc.isInvalid())
return;
Callback(Loc, *cast<NamedDecl>(ND->getCanonicalDecl()));
}
public:
ASTWalker(DeclCallback Callback) : Callback(Callback) {}
bool VisitTagTypeLoc(TagTypeLoc TTL) {
report(TTL.getNameLoc(), TTL.getDecl());
return true;
}
bool VisitDeclRefExpr(DeclRefExpr *DRE) {
report(DRE->getLocation(), DRE->getFoundDecl());
return true;
}
};
} // namespace
void walkAST(Decl &Root, DeclCallback Callback) {
ASTWalker(Callback).TraverseDecl(&Root);
}
} // namespace include_cleaner
} // namespace clang

View File

@ -0,0 +1,25 @@
set(CLANG_INCLUDE_CLEANER_TEST_DEPS
ClangIncludeCleanerTests
)
foreach (dep FileCheck not count)
if(TARGET ${dep})
list(APPEND CLANG_INCLUDE_CLEANER_TEST_DEPS ${dep})
endif()
endforeach()
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
MAIN_CONFIG
${CMAKE_CURRENT_BINARY_DIR}/lit.cfg.py)
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py
MAIN_CONFIG
${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.cfg.py)
add_lit_testsuite(check-clang-include-cleaner "Running the clang-include-cleaner regression tests"
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${CLANG_INCLUDE_CLEANER_TEST_DEPS})

View File

@ -0,0 +1,18 @@
import lit.formats
config.name = "clangIncludeCleaner Unit Tests"
config.test_format = lit.formats.GoogleTest('.', 'Tests')
config.test_source_root = config.clang_include_cleaner_binary_dir + "/unittests"
config.test_exec_root = config.clang_include_cleaner_binary_dir + "/unittests"
# Point the dynamic loader at dynamic libraries in 'lib'.
# FIXME: it seems every project has a copy of this logic. Move it somewhere.
import platform
if platform.system() == 'Darwin':
shlibpath_var = 'DYLD_LIBRARY_PATH'
elif platform.system() == 'Windows':
shlibpath_var = 'PATH'
else:
shlibpath_var = 'LD_LIBRARY_PATH'
config.environment[shlibpath_var] = os.path.pathsep.join((
"@SHLIBDIR@", "@LLVM_LIBS_DIR@",
config.environment.get(shlibpath_var,'')))

View File

@ -0,0 +1,10 @@
@LIT_SITE_CFG_IN_HEADER@
# This is a shim to run the gtest unittests in ../unittests using lit.
config.llvm_libs_dir = path("@LLVM_LIBS_DIR@")
config.shlibdir = path("@SHLIBDIR@")
config.clang_include_cleaner_binary_dir = path("@CMAKE_CURRENT_BINARY_DIR@/..")
# Delegate logic to lit.cfg.py.
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/Unit/lit.cfg.py")

View File

@ -0,0 +1,16 @@
import lit.llvm
lit.llvm.initialize(lit_config, config)
lit.llvm.llvm_config.use_default_substitutions()
config.name = 'ClangIncludeCleaner'
config.suffixes = ['.test', '.c', '.cpp']
config.excludes = ['Inputs']
config.test_format = lit.formats.ShTest(not lit.llvm.llvm_config.use_lit_shell)
config.test_source_root = config.clang_include_cleaner_source_dir + "/test"
config.test_exec_root = config.clang_include_cleaner_binary_dir + "/test"
config.environment['PATH'] = os.path.pathsep.join((
config.clang_tools_dir,
config.llvm_tools_dir,
config.environment['PATH']))

View File

@ -0,0 +1,14 @@
@LIT_SITE_CFG_IN_HEADER@
# Variables needed for common llvm config.
config.clang_tools_dir = path("@CURRENT_TOOLS_DIR@")
config.lit_tools_dir = path("@LLVM_LIT_TOOLS_DIR@")
config.llvm_tools_dir = path(lit_config.substitute("@LLVM_TOOLS_DIR@"))
config.llvm_libs_dir = path(lit_config.substitute("@LLVM_LIBS_DIR@"))
config.target_triple = "@TARGET_TRIPLE@"
config.python_executable = "@Python3_EXECUTABLE@"
config.clang_include_cleaner_source_dir = path("@CMAKE_CURRENT_SOURCE_DIR@/..")
config.clang_include_cleaner_binary_dir = path("@CMAKE_CURRENT_BINARY_DIR@/..")
# Delegate logic to lit.cfg.py.
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")

View File

@ -0,0 +1,25 @@
set(LLVM_LINK_COMPONENTS
Support
TestingSupport
)
add_custom_target(ClangIncludeCleanerUnitTests)
add_unittest(ClangIncludeCleanerUnitTests ClangIncludeCleanerTests
WalkASTTest.cpp
)
target_include_directories(ClangIncludeCleanerTests
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../lib)
clang_target_link_libraries(ClangIncludeCleanerTests
PRIVATE
clangBasic
)
target_link_libraries(ClangIncludeCleanerTests
PRIVATE
clangIncludeCleaner
clangTesting
)

View File

@ -0,0 +1,110 @@
#include "AnalysisInternal.h"
#include "clang/AST/ASTContext.h"
#include "clang/Basic/FileManager.h"
#include "clang/Frontend/TextDiagnostic.h"
#include "clang/Testing/TestAST.h"
#include "llvm/Support/Error.h"
#include "llvm/Testing/Support/Annotations.h"
#include "gtest/gtest.h"
namespace clang {
namespace include_cleaner {
namespace {
// Specifies a test of which symbols are referenced by a piece of code.
//
// Example:
// Target: int ^foo();
// Referencing: int x = ^foo();
// There must be exactly one referencing location marked.
void testWalk(llvm::StringRef TargetCode, llvm::StringRef ReferencingCode) {
llvm::Annotations Target(TargetCode);
llvm::Annotations Referencing(ReferencingCode);
TestInputs Inputs(Referencing.code());
Inputs.ExtraFiles["target.h"] = Target.code().str();
Inputs.ExtraArgs.push_back("-include");
Inputs.ExtraArgs.push_back("target.h");
TestAST AST(Inputs);
const auto &SM = AST.sourceManager();
// We're only going to record references from the nominated point,
// to the target file.
FileID ReferencingFile = SM.getMainFileID();
SourceLocation ReferencingLoc =
SM.getComposedLoc(ReferencingFile, Referencing.point());
FileID TargetFile = SM.translateFile(
llvm::cantFail(AST.fileManager().getFileRef("target.h")));
// Perform the walk, and capture the offsets of the referenced targets.
std::vector<size_t> ReferencedOffsets;
for (Decl *D : AST.context().getTranslationUnitDecl()->decls()) {
if (ReferencingFile != SM.getDecomposedExpansionLoc(D->getLocation()).first)
continue;
walkAST(*D, [&](SourceLocation Loc, NamedDecl &ND) {
if (SM.getFileLoc(Loc) != ReferencingLoc)
return;
auto NDLoc = SM.getDecomposedLoc(SM.getFileLoc(ND.getLocation()));
if (NDLoc.first != TargetFile)
return;
ReferencedOffsets.push_back(NDLoc.second);
});
}
llvm::sort(ReferencedOffsets);
// Compare results to the expected points.
// For each difference, show the target point in context, like a diagnostic.
std::string DiagBuf;
llvm::raw_string_ostream DiagOS(DiagBuf);
auto *DiagOpts = new DiagnosticOptions();
DiagOpts->ShowLevel = 0;
DiagOpts->ShowNoteIncludeStack = 0;
TextDiagnostic Diag(DiagOS, AST.context().getLangOpts(), DiagOpts);
auto DiagnosePoint = [&](const char *Message, unsigned Offset) {
Diag.emitDiagnostic(
FullSourceLoc(SM.getComposedLoc(TargetFile, Offset), SM),
DiagnosticsEngine::Note, Message, {}, {});
};
for (auto Expected : Target.points())
if (!llvm::is_contained(ReferencedOffsets, Expected))
DiagnosePoint("location not marked used", Expected);
for (auto Actual : ReferencedOffsets)
if (!llvm::is_contained(Target.points(), Actual))
DiagnosePoint("location unexpectedly used", Actual);
// If there were any differences, we print the entire referencing code once.
if (!DiagBuf.empty())
ADD_FAILURE() << DiagBuf << "\nfrom code:\n" << ReferencingCode;
}
TEST(WalkAST, DeclRef) {
testWalk("int ^x;", "int y = ^x;");
testWalk("int ^foo();", "int y = ^foo();");
testWalk("namespace ns { int ^x; }", "int y = ns::^x;");
testWalk("struct S { static int ^x; };", "int y = S::^x;");
// Canonical declaration only.
testWalk("extern int ^x; int x;", "int y = ^x;");
}
TEST(WalkAST, TagType) {
testWalk("struct ^S {};", "^S *y;");
testWalk("enum ^E {};", "^E *y;");
testWalk("struct ^S { static int x; };", "int y = ^S::x;");
}
TEST(WalkAST, Alias) {
testWalk(R"cpp(
namespace ns { int x; }
using ns::^x;
)cpp",
"int y = ^x;");
testWalk(R"cpp(
namespace ns { struct S; } // Not used
using ns::S; // FIXME: S should be used
)cpp",
"^S *x;");
}
} // namespace
} // namespace include_cleaner
} // namespace clang

View File

@ -45,6 +45,10 @@ struct TestInputs {
/// Extra argv to pass to clang -cc1.
std::vector<std::string> ExtraArgs = {};
/// Extra virtual files that are available to be #included.
/// Keys are plain filenames ("foo.h"), values are file content.
llvm::StringMap<std::string> ExtraFiles = {};
/// By default, error diagnostics during parsing are reported as gtest errors.
/// To suppress this, set ErrorOK or include "error-ok" in a comment in Code.
/// In either case, all diagnostics appear in TestAST::diagnostics().

View File

@ -105,6 +105,10 @@ TestAST::TestAST(const TestInputs &In) {
auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
VFS->addFile(Filename, /*ModificationTime=*/0,
llvm::MemoryBuffer::getMemBufferCopy(In.Code, Filename));
for (const auto &Extra : In.ExtraFiles)
VFS->addFile(
Extra.getKey(), /*ModificationTime=*/0,
llvm::MemoryBuffer::getMemBufferCopy(Extra.getValue(), Extra.getKey()));
Clang->createFileManager(VFS);
// Running the FrontendAction creates the other components: SourceManager,