llvm-project/clang-tools-extra/clangd/IncludeCleaner.cpp

//===--- IncludeCleaner.cpp - Unused/Missing Headers Analysis ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "IncludeCleaner.h"
#include "Config.h"
#include "Headers.h"
#include "ParsedAST.h"
#include "Protocol.h"
#include "SourceCode.h"
#include "support/Logger.h"
#include "support/Trace.h"
#include "clang/AST/ExprCXX.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Tooling/Syntax/Tokens.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Path.h"

namespace clang {
namespace clangd {

static bool AnalyzeStdlib = false;
void setIncludeCleanerAnalyzesStdlib(bool B) { AnalyzeStdlib = B; }

namespace {

/// Crawler traverses the AST and feeds in the locations of (sometimes
/// implicitly) used symbols into \p Result.
class ReferencedLocationCrawler
    : public RecursiveASTVisitor<ReferencedLocationCrawler> {
public:
  ReferencedLocationCrawler(ReferencedLocations &Result,
                            const SourceManager &SM)
      : Result(Result), SM(SM) {}

  bool VisitDeclRefExpr(DeclRefExpr *DRE) {
    add(DRE->getDecl());
    add(DRE->getFoundDecl());
    return true;
  }

  bool VisitMemberExpr(MemberExpr *ME) {
    add(ME->getMemberDecl());
    add(ME->getFoundDecl().getDecl());
    return true;
  }

  bool VisitTagType(TagType *TT) {
    add(TT->getDecl());
    return true;
  }

  bool VisitFunctionDecl(FunctionDecl *FD) {
    // Function definition will require redeclarations to be included.
    if (FD->isThisDeclarationADefinition())
      add(FD);
    return true;
  }

  bool VisitCXXConstructExpr(CXXConstructExpr *CCE) {
    add(CCE->getConstructor());
    return true;
  }

  bool VisitTemplateSpecializationType(TemplateSpecializationType *TST) {
    if (isNew(TST)) {
      add(TST->getTemplateName().getAsTemplateDecl()); // Primary template.
      add(TST->getAsCXXRecordDecl());                  // Specialization
    }
    return true;
  }

  bool VisitUsingType(UsingType *UT) {
    add(UT->getFoundDecl());
    return true;
  }

  bool VisitTypedefType(TypedefType *TT) {
    add(TT->getDecl());
    return true;
  }

  // Consider types of any subexpression used, even if the type is not named.
  // This is helpful in getFoo().bar(), where Foo must be complete.
  // FIXME(kirillbobyrev): Should we tweak this? It may not be desirable to
  // consider types "used" when they are not directly spelled in code.
  bool VisitExpr(Expr *E) {
    TraverseType(E->getType());
    return true;
  }

  bool TraverseType(QualType T) {
    if (isNew(T.getTypePtrOrNull())) // don't care about quals
      Base::TraverseType(T);
    return true;
  }

  bool VisitUsingDecl(UsingDecl *D) {
    for (const auto *Shadow : D->shadows())
      add(Shadow->getTargetDecl());
    return true;
  }

  // Enums may be usefully forward-declared as *complete* types by specifying
  // an underlying type. In this case, the definition should see the declaration
  // so they can be checked for compatibility.
  bool VisitEnumDecl(EnumDecl *D) {
    if (D->isThisDeclarationADefinition() && D->getIntegerTypeSourceInfo())
      add(D);
    return true;
  }

  // When the overload is not resolved yet, mark all candidates as used.
  bool VisitOverloadExpr(OverloadExpr *E) {
    for (const auto *ResolutionDecl : E->decls())
      add(ResolutionDecl);
    return true;
  }

private:
  using Base = RecursiveASTVisitor<ReferencedLocationCrawler>;

  void add(const Decl *D) {
    if (!D || !isNew(D->getCanonicalDecl()))
      return;
    if (auto SS = StdRecognizer(D)) {
      Result.Stdlib.insert(*SS);
      return;
    }
    // Special case RecordDecls, as it is common for them to be forward
    // declared multiple times. The most common cases are:
    // - Definition available in TU, only mark that one as usage. The rest is
    //   likely to be unnecessary. This might result in false positives when an
    //   internal definition is visible.
    // - There's a forward declaration in the main file, no need for other
    //   redecls.
    if (const auto *RD = llvm::dyn_cast<RecordDecl>(D)) {
      if (const auto *Definition = RD->getDefinition()) {
        Result.User.insert(Definition->getLocation());
        return;
      }
      if (SM.isInMainFile(RD->getMostRecentDecl()->getLocation()))
        return;
    }
    for (const Decl *Redecl : D->redecls())
      Result.User.insert(Redecl->getLocation());
  }

  bool isNew(const void *P) { return P && Visited.insert(P).second; }

  ReferencedLocations &Result;
  llvm::DenseSet<const void *> Visited;
  const SourceManager &SM;
  stdlib::Recognizer StdRecognizer;
};

// Given a set of referenced FileIDs, determines all the potentially-referenced
// files and macros by traversing expansion/spelling locations of macro IDs.
// This is used to map the referenced SourceLocations onto real files.
struct ReferencedFilesBuilder {
  ReferencedFilesBuilder(const SourceManager &SM) : SM(SM) {}
  llvm::DenseSet<FileID> Files;
  llvm::DenseSet<FileID> Macros;
  const SourceManager &SM;

  void add(SourceLocation Loc) { add(SM.getFileID(Loc), Loc); }

  void add(FileID FID, SourceLocation Loc) {
    if (FID.isInvalid())
      return;
    assert(SM.isInFileID(Loc, FID));
    if (Loc.isFileID()) {
      Files.insert(FID);
      return;
    }
    // Don't process the same macro FID twice.
    if (!Macros.insert(FID).second)
      return;
    const auto &Exp = SM.getSLocEntry(FID).getExpansion();
    add(Exp.getSpellingLoc());
    add(Exp.getExpansionLocStart());
    add(Exp.getExpansionLocEnd());
  }
};

// Returns the range starting at '#' and ending at EOL. Escaped newlines are not
// handled.
clangd::Range getDiagnosticRange(llvm::StringRef Code, unsigned HashOffset) {
  clangd::Range Result;
  Result.end = Result.start = offsetToPosition(Code, HashOffset);

  // Span the warning until the EOL or EOF.
  Result.end.character +=
      lspLength(Code.drop_front(HashOffset).take_until([](char C) {
        return C == '\n' || C == '\r';
      }));
  return Result;
}

// Finds locations of macros referenced from within the main file. That includes
// references that were not yet expanded, e.g `BAR` in `#define FOO BAR`.
void findReferencedMacros(ParsedAST &AST, ReferencedLocations &Result) {
  trace::Span Tracer("IncludeCleaner::findReferencedMacros");
  auto &SM = AST.getSourceManager();
  auto &PP = AST.getPreprocessor();
  // FIXME(kirillbobyrev): The macros from the main file are collected in
  // ParsedAST's MainFileMacros. However, we can't use it here because it
  // doesn't handle macro references that were not expanded, e.g. in macro
  // definitions or preprocessor-disabled sections.
  //
  // Extending MainFileMacros to collect missing references and switching to
  // this mechanism (as opposed to iterating through all tokens) will improve
  // the performance of findReferencedMacros and also improve other features
  // relying on MainFileMacros.
  for (const syntax::Token &Tok :
       AST.getTokens().spelledTokens(SM.getMainFileID())) {
    auto Macro = locateMacroAt(Tok, PP);
    if (!Macro)
      continue;
    auto Loc = Macro->Info->getDefinitionLoc();
    if (Loc.isValid())
      Result.User.insert(Loc);
    // FIXME: support stdlib macros
  }
}

static bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST) {
  if (Inc.BehindPragmaKeep)
    return false;

  // FIXME(kirillbobyrev): We currently do not support the umbrella headers.
  // System headers are likely to be standard library headers.
  // Until we have good support for umbrella headers, don't warn about them.
  if (Inc.Written.front() == '<') {
    if (AnalyzeStdlib && stdlib::Header::named(Inc.Written))
      return true;
    return false;
  }
  // Headers without include guards have side effects and are not
  // self-contained, skip them.
  assert(Inc.HeaderID);
  auto FE = AST.getSourceManager().getFileManager().getFile(
      AST.getIncludeStructure().getRealPath(
          static_cast<IncludeStructure::HeaderID>(*Inc.HeaderID)));
  assert(FE);
  if (!AST.getPreprocessor().getHeaderSearchInfo().isFileMultipleIncludeGuarded(
          *FE)) {
    dlog("{0} doesn't have header guard and will not be considered unused",
         (*FE)->getName());
    return false;
  }
  return true;
}

// In case symbols are coming from non self-contained header, we need to find
// its first includer that is self-contained. This is the header users can
// include, so it will be responsible for bringing the symbols from given
// header into the scope.
FileID headerResponsible(FileID ID, const SourceManager &SM,
                         const IncludeStructure &Includes) {
  // Unroll the chain of non self-contained headers until we find the one that
  // can be included.
  for (const FileEntry *FE = SM.getFileEntryForID(ID); ID != SM.getMainFileID();
       FE = SM.getFileEntryForID(ID)) {
    // If FE is nullptr, we consider it to be the responsible header.
    if (!FE)
      break;
    auto HID = Includes.getID(FE);
    assert(HID && "We're iterating over headers already existing in "
                  "IncludeStructure");
    if (Includes.isSelfContained(*HID))
      break;
    // The header is not self-contained: put the responsibility for its symbols
    // on its includer.
    ID = SM.getFileID(SM.getIncludeLoc(ID));
  }
  return ID;
}

} // namespace

ReferencedLocations findReferencedLocations(ParsedAST &AST) {
  trace::Span Tracer("IncludeCleaner::findReferencedLocations");
  ReferencedLocations Result;
  ReferencedLocationCrawler Crawler(Result, AST.getSourceManager());
  Crawler.TraverseAST(AST.getASTContext());
  findReferencedMacros(AST, Result);
  return Result;
}

ReferencedFiles findReferencedFiles(const ReferencedLocations &Locs,
                                    const IncludeStructure &Includes,
                                    const SourceManager &SM) {
  std::vector<SourceLocation> Sorted{Locs.User.begin(), Locs.User.end()};
  llvm::sort(Sorted); // Group by FileID.
  ReferencedFilesBuilder Builder(SM);
  for (auto It = Sorted.begin(); It < Sorted.end();) {
    FileID FID = SM.getFileID(*It);
    Builder.add(FID, *It);
    // Cheaply skip over all the other locations from the same FileID.
    // This avoids lots of redundant Loc->File lookups for the same file.
    do
      ++It;
    while (It != Sorted.end() && SM.isInFileID(*It, FID));
  }

  // If a header is not self-contained, we consider its symbols a logical part
  // of the including file. Therefore, mark the parents of all used
  // non-self-contained FileIDs as used. Perform this on FileIDs rather than
  // HeaderIDs, as each inclusion of a non-self-contained file is distinct.
  llvm::DenseSet<FileID> UserFiles;
  for (FileID ID : Builder.Files)
    UserFiles.insert(headerResponsible(ID, SM, Includes));

  llvm::DenseSet<stdlib::Header> StdlibFiles;
  for (const auto &Symbol : Locs.Stdlib)
    for (const auto &Header : Symbol.headers())
      StdlibFiles.insert(Header);

  return {std::move(UserFiles), std::move(StdlibFiles)};
}

std::vector<const Inclusion *>
getUnused(ParsedAST &AST,
          const llvm::DenseSet<IncludeStructure::HeaderID> &ReferencedFiles) {
  trace::Span Tracer("IncludeCleaner::getUnused");
  std::vector<const Inclusion *> Unused;
  for (const Inclusion &MFI : AST.getIncludeStructure().MainFileIncludes) {
    if (!MFI.HeaderID)
      continue;
    auto IncludeID = static_cast<IncludeStructure::HeaderID>(*MFI.HeaderID);
    bool Used = ReferencedFiles.contains(IncludeID);
    if (!Used && !mayConsiderUnused(MFI, AST)) {
      dlog("{0} was not used, but is not eligible to be diagnosed as unused",
           MFI.Written);
      continue;
    }
    if (!Used)
      Unused.push_back(&MFI);
    dlog("{0} is {1}", MFI.Written, Used ? "USED" : "UNUSED");
  }
  return Unused;
}

#ifndef NDEBUG
// Is FID a <built-in>, <scratch space> etc?
static bool isSpecialBuffer(FileID FID, const SourceManager &SM) {
  const SrcMgr::FileInfo &FI = SM.getSLocEntry(FID).getFile();
  return FI.getName().startswith("<");
}
#endif

llvm::DenseSet<IncludeStructure::HeaderID>
translateToHeaderIDs(const ReferencedFiles &Files,
                     const IncludeStructure &Includes,
                     const SourceManager &SM) {
  trace::Span Tracer("IncludeCleaner::translateToHeaderIDs");
  llvm::DenseSet<IncludeStructure::HeaderID> TranslatedHeaderIDs;
  TranslatedHeaderIDs.reserve(Files.User.size());
  for (FileID FID : Files.User) {
    const FileEntry *FE = SM.getFileEntryForID(FID);
    if (!FE) {
      assert(isSpecialBuffer(FID, SM));
      continue;
    }
    const auto File = Includes.getID(FE);
    assert(File);
    TranslatedHeaderIDs.insert(*File);
  }
  for (stdlib::Header StdlibUsed : Files.Stdlib)
    for (auto HID : Includes.StdlibHeaders.lookup(StdlibUsed))
      TranslatedHeaderIDs.insert(HID);
  return TranslatedHeaderIDs;
}

std::vector<const Inclusion *> computeUnusedIncludes(ParsedAST &AST) {
  const auto &SM = AST.getSourceManager();

  auto Refs = findReferencedLocations(AST);
  auto ReferencedFileIDs = findReferencedFiles(Refs, AST.getIncludeStructure(),
                                               AST.getSourceManager());
  auto ReferencedHeaders =
      translateToHeaderIDs(ReferencedFileIDs, AST.getIncludeStructure(), SM);
  return getUnused(AST, ReferencedHeaders);
}

std::vector<Diag> issueUnusedIncludesDiagnostics(ParsedAST &AST,
                                                 llvm::StringRef Code) {
  const Config &Cfg = Config::current();
  if (Cfg.Diagnostics.UnusedIncludes != Config::UnusedIncludesPolicy::Strict ||
      Cfg.Diagnostics.SuppressAll ||
      Cfg.Diagnostics.Suppress.contains("unused-includes"))
    return {};
  trace::Span Tracer("IncludeCleaner::issueUnusedIncludesDiagnostics");
  std::vector<Diag> Result;
  std::string FileName =
      AST.getSourceManager()
          .getFileEntryForID(AST.getSourceManager().getMainFileID())
          ->getName()
          .str();
  for (const auto *Inc : computeUnusedIncludes(AST)) {
    Diag D;
    D.Message =
        llvm::formatv("included header {0} is not used",
                      llvm::sys::path::filename(
                          Inc->Written.substr(1, Inc->Written.size() - 2),
                          llvm::sys::path::Style::posix));
    D.Name = "unused-includes";
    D.Source = Diag::DiagSource::Clangd;
    D.File = FileName;
    D.Severity = DiagnosticsEngine::Warning;
    D.Tags.push_back(Unnecessary);
    D.Range = getDiagnosticRange(Code, Inc->HashOffset);
    // FIXME(kirillbobyrev): Removing inclusion might break the code if the
    // used headers are only reachable transitively through this one. Suggest
    // including them directly instead.
    // FIXME(kirillbobyrev): Add fix suggestion for adding IWYU pragmas
    // (keep/export) remove the warning once we support IWYU pragmas.
    D.Fixes.emplace_back();
    D.Fixes.back().Message = "remove #include directive";
    D.Fixes.back().Edits.emplace_back();
    D.Fixes.back().Edits.back().range.start.line = Inc->HashLine;
    D.Fixes.back().Edits.back().range.end.line = Inc->HashLine + 1;
    D.InsideMainFile = true;
    Result.push_back(std::move(D));
  }
  return Result;
}

} // namespace clangd
} // namespace clang