From 02871701400253a49de502a5fef770f92772f6bc Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Fri, 15 Jul 2022 10:26:19 -0700 Subject: [PATCH] [clang][deps] Include canonical invocation in ContextHash The "strict context hash" is insufficient to identify module dependencies during scanning, leading to different module build commands being produced for a single module, and non-deterministically choosing between them. This commit switches to hashing the canonicalized `CompilerInvocation` of the module. By hashing the invocation we are converting these from correctness issues to performance issues, and we can then incrementally improve our ability to canonicalize command-lines. This change can cause a regression in the number of modules needed. Of the 4 projects I tested, 3 had no regression, but 1, which was clang+llvm itself, had a 66% regression in number of modules (4% regression in total invocations). This is almost entirely due to differences between -W options across targets. Of this, 25% of the additional modules are system modules, which we could avoid if we canonicalized -W options when -Wsystem-headers is not present -- unfortunately this is non-trivial due to some warnings being enabled in system headers by default. The rest of the additional modules are mostly real differences in potential warnings, reflecting incorrect behaviour in the current scanner. There were also a couple of differences due to `-DFOO` `-fmodule-ignore-macro=FOO`, which I fixed here. Since the output paths for the module depend on its context hash, we hash the invocation before filling in outputs, and rely on the build system to always return the same output paths for a given module. Note: since the scanner itself uses an implicit modules build, there can still be non-determinism, but it will now present as different module+hashes rather than different command-lines for the same module+hash. Differential Revision: https://reviews.llvm.org/D129884 --- .../DependencyScanning/ModuleDepCollector.h | 8 +- .../DependencyScanning/ModuleDepCollector.cpp | 63 ++++++++++- .../modules-context-hash-ignore-macros.c | 100 ++++++++++++++++++ .../modules-context-hash-module-map-path.c | 77 ++++++++++++++ .../modules-context-hash-outputs.c | 77 ++++++++++++++ .../modules-context-hash-warnings.c | 77 ++++++++++++++ 6 files changed, 396 insertions(+), 6 deletions(-) create mode 100644 clang/test/ClangScanDeps/modules-context-hash-ignore-macros.c create mode 100644 clang/test/ClangScanDeps/modules-context-hash-module-map-path.c create mode 100644 clang/test/ClangScanDeps/modules-context-hash-outputs.c create mode 100644 clang/test/ClangScanDeps/modules-context-hash-warnings.c diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 05c9f56b4cf6..377dcd5a68fe 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -46,9 +46,11 @@ struct ModuleID { /// or a header-name for C++20 header units. std::string ModuleName; - /// The context hash of a module represents the set of compiler options that - /// may make one version of a module incompatible with another. This includes - /// things like language mode, predefined macros, header search paths, etc... + /// The context hash of a module represents the compiler options that affect + /// the resulting command-line invocation. + /// + /// Modules with the same name and ContextHash but different invocations could + /// cause non-deterministic build results. /// /// Modules with the same name but a different \c ContextHash should be /// treated as separate modules for the purpose of a build. diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 725bb2c318ac..8d58a53a5971 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -12,6 +12,7 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Lex/Preprocessor.h" #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h" +#include "llvm/Support/BLAKE3.h" #include "llvm/Support/StringSaver.h" using namespace clang; @@ -77,6 +78,19 @@ CompilerInvocation ModuleDepCollector::makeInvocationForModuleBuildWithoutPaths( CI.getHeaderSearchOpts().ModuleCachePruneInterval = 7 * 24 * 60 * 60; CI.getHeaderSearchOpts().ModuleCachePruneAfter = 31 * 24 * 60 * 60; + // Remove any macro definitions that are explicitly ignored. + if (!CI.getHeaderSearchOpts().ModulesIgnoreMacros.empty()) { + llvm::erase_if( + CI.getPreprocessorOpts().Macros, + [&CI](const std::pair &Def) { + StringRef MacroDef = Def.first; + return CI.getHeaderSearchOpts().ModulesIgnoreMacros.contains( + llvm::CachedHashString(MacroDef.split('=').first)); + }); + // Remove the now unused option. + CI.getHeaderSearchOpts().ModulesIgnoreMacros.clear(); + } + // Report the prebuilt modules this module uses. for (const auto &PrebuiltModule : Deps.PrebuiltModuleDeps) CI.getFrontendOpts().ModuleFiles.push_back(PrebuiltModule.PCMFile); @@ -156,6 +170,50 @@ std::vector ModuleDeps::getCanonicalCommandLine( return serializeCompilerInvocation(CI); } +static std::string getModuleContextHash(const ModuleDeps &MD) { + llvm::HashBuilder, + llvm::support::endianness::native> + HashBuilder; + SmallString<32> Scratch; + + // Hash the compiler version and serialization version to ensure the module + // will be readable. + HashBuilder.add(getClangFullRepositoryVersion()); + HashBuilder.add(serialization::VERSION_MAJOR, serialization::VERSION_MINOR); + + // Hash the BuildInvocation without any input files. + SmallVector DummyArgs; + MD.BuildInvocation.generateCC1CommandLine(DummyArgs, [&](const Twine &Arg) { + Scratch.clear(); + StringRef Str = Arg.toStringRef(Scratch); + HashBuilder.add(Str); + return ""; + }); + + // Hash the input file paths and module dependencies. These paths may differ + // even if the invocation is identical if they depend on the contents of the + // files in the TU -- for example, case-insensitive paths to modulemap files. + // Usually such a case would indicate a missed optimization to canonicalize, + // but it may be difficult to canonicalize all cases when there is a VFS. + HashBuilder.add(MD.ClangModuleMapFile); + for (const auto &Dep : MD.PrebuiltModuleDeps) + HashBuilder.add(Dep.PCMFile); + for (const auto &ID : MD.ClangModuleDeps) { + HashBuilder.add(ID.ModuleName); + HashBuilder.add(ID.ContextHash); + } + + // Hash options that affect which callbacks are made for outputs. + HashBuilder.add(MD.HadDependencyFile); + HashBuilder.add(MD.HadSerializedDiagnostics); + + llvm::BLAKE3Result<16> Hash = HashBuilder.final(); + std::array Words; + static_assert(sizeof(Hash) == sizeof(Words), "Hash must match Words"); + std::memcpy(Words.data(), Hash.data(), sizeof(Hash)); + return toString(llvm::APInt(sizeof(Words) * 8, Words), 36, /*Signed=*/false); +} + std::vector ModuleDeps::getCanonicalCommandLineWithoutModulePaths() const { return serializeCompilerInvocation(BuildInvocation); @@ -346,13 +404,12 @@ ModuleID ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { .DiagnosticSerializationFile.empty(); MD.HadDependencyFile = !MDC.OriginalInvocation.getDependencyOutputOpts().OutputFile.empty(); - // FIXME: HadSerializedDiagnostics and HadDependencyFile should be included in - // the context hash since it can affect the command-line. - MD.ID.ContextHash = MD.BuildInvocation.getModuleHash(); llvm::DenseSet AddedModules; addAllSubmoduleDeps(M, MD, AddedModules); + // Do this last since it requires the dependencies. + MD.ID.ContextHash = getModuleContextHash(MD); return MD.ID; } diff --git a/clang/test/ClangScanDeps/modules-context-hash-ignore-macros.c b/clang/test/ClangScanDeps/modules-context-hash-ignore-macros.c new file mode 100644 index 000000000000..982bf1e06b51 --- /dev/null +++ b/clang/test/ClangScanDeps/modules-context-hash-ignore-macros.c @@ -0,0 +1,100 @@ +// Ensure '-DFOO -fmodules-ignore-macro=FOO' and '' both produce the same +// canonical module build command. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1 \ +// RUN: -format experimental-full > %t/deps.json +// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s + +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK: { +// CHECK: "command-line": [ +// CHECK-NOT: "FOO" +// CHECK-NOT: "-fmodules-ignore-macro +// CHECK: ] +// CHECK: "context-hash": "[[HASH_NO_FOO:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "command-line": [ +// CHECK: "-D" +// CHECK-NEXT: "FOO" +// CHECK: ] +// CHECK: "context-hash": "[[HASH_FOO:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "translation-units": [ +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH_NO_FOO]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "-DFOO" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu1.c" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH_FOO]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK: "-DFOO" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu2.c" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH_NO_FOO]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK: "-DFOO" +// CHECK: "-fmodules-ignore-macro=FOO" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu3.c" + +//--- cdb.json.template +[ + { + "directory": "DIR", + "command": "clang -fsyntax-only DIR/tu1.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu1.c" + }, + { + "directory": "DIR", + "command": "clang -DFOO -fsyntax-only DIR/tu2.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu2.c" + }, + { + "directory": "DIR", + "command": "clang -DFOO -fmodules-ignore-macro=FOO -fsyntax-only DIR/tu3.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu3.c" + }, +] + +//--- module.modulemap +module Mod { header "Mod.h" } + +//--- Mod.h + +//--- tu1.c +#include "Mod.h" + +//--- tu2.c +#include "Mod.h" + +//--- tu3.c +#include "Mod.h" diff --git a/clang/test/ClangScanDeps/modules-context-hash-module-map-path.c b/clang/test/ClangScanDeps/modules-context-hash-module-map-path.c new file mode 100644 index 000000000000..0fde558ab095 --- /dev/null +++ b/clang/test/ClangScanDeps/modules-context-hash-module-map-path.c @@ -0,0 +1,77 @@ +// Ensure the path to the modulemap input is included in the module context hash +// irrespective of other TU command-line arguments, as it effects the canonical +// module build command. In this test we use the difference in spelling between +// module.modulemap and module.map, but it also applies to situations such as +// differences in case-insensitive paths if they are not canonicalized away. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1 -generate-modules-path-args \ +// RUN: -format experimental-full > %t/deps.json + +// RUN: mv %t/module.modulemap %t/module.map +// RUN: echo 'AFTER_MOVE' >> %t/deps.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1 -generate-modules-path-args \ +// RUN: -format experimental-full >> %t/deps.json + +// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s + +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK: { +// CHECK: "command-line": [ +// CHECK: "{{.*}}module.modulemap" +// CHECK: ] +// CHECK: "context-hash": "[[HASH1:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "translation-units": [ +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH1]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-LABEL: AFTER_MOVE +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK: { +// CHECK-NOT: [[HASH1]] +// CHECK: "command-line": [ +// CHECK: "{{.*}}module.map" +// CHECK: ] +// CHECK-NOT: [[HASH1]] +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "translation-units": [ +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": +// CHECK-NOT: [[HASH1]] +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] + +//--- cdb.json.template +[ + { + "directory": "DIR", + "command": "clang -fsyntax-only DIR/tu.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu.c" + } +] + +//--- module.modulemap +module Mod { header "Mod.h" } + +//--- Mod.h + +//--- tu.c +#include "Mod.h" diff --git a/clang/test/ClangScanDeps/modules-context-hash-outputs.c b/clang/test/ClangScanDeps/modules-context-hash-outputs.c new file mode 100644 index 000000000000..3eb7ec7020b7 --- /dev/null +++ b/clang/test/ClangScanDeps/modules-context-hash-outputs.c @@ -0,0 +1,77 @@ +// If secondary output files such as .d are enabled, ensure it affects the +// module context hash since it may impact the resulting module build commands. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1 -generate-modules-path-args \ +// RUN: -format experimental-full > %t/deps.json +// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s + +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK: { +// CHECK: "command-line": [ +// CHECK: "-dependency-file" +// CHECK: ] +// CHECK: "context-hash": "[[HASH1:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "command-line": [ +// CHECK-NOT: "-dependency-file" +// CHECK: ] +// CHECK: "context-hash": "[[HASH2:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "translation-units": [ +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH1]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK: "-MF" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu1.c" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH2]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "-MF" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu2.c" + +//--- cdb.json.template +[ + { + "directory": "DIR", + "command": "clang -MD -MF DIR/tu1.d -fsyntax-only DIR/tu1.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu1.c" + }, + { + "directory": "DIR", + "command": "clang -fsyntax-only DIR/tu2.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu2.c" + }, +] + +//--- module.modulemap +module Mod { header "Mod.h" } + +//--- Mod.h + +//--- tu1.c +#include "Mod.h" + +//--- tu2.c +#include "Mod.h" diff --git a/clang/test/ClangScanDeps/modules-context-hash-warnings.c b/clang/test/ClangScanDeps/modules-context-hash-warnings.c new file mode 100644 index 000000000000..8457c7db53fb --- /dev/null +++ b/clang/test/ClangScanDeps/modules-context-hash-warnings.c @@ -0,0 +1,77 @@ +// Differences in -W warning options should result in different canonical module +// build commands. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1 \ +// RUN: -format experimental-full > %t/deps.json +// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s + +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK: { +// CHECK: "command-line": [ +// CHECK: "-Wall" +// CHECK: ] +// CHECK: "context-hash": "[[HASH1:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "command-line": [ +// CHECK-NOT: "-Wall" +// CHECK: ] +// CHECK: "context-hash": "[[HASH2:.*]]" +// CHECK: "name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "translation-units": [ +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH1]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK: "-Wall" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu1.c" +// CHECK-NEXT: } +// CHECK-NEXT: { +// CHECK: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "[[HASH2]]" +// CHECK-NEXT: "module-name": "Mod" +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "-Wall" +// CHECK: ] +// CHECK: "input-file": "{{.*}}tu2.c" + +//--- cdb.json.template +[ + { + "directory": "DIR", + "command": "clang -Wall -fsyntax-only DIR/tu1.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu1.c" + }, + { + "directory": "DIR", + "command": "clang -fsyntax-only DIR/tu2.c -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache", + "file": "DIR/tu2.c" + }, +] + +//--- module.modulemap +module Mod { header "Mod.h" } + +//--- Mod.h + +//--- tu1.c +#include "Mod.h" + +//--- tu2.c +#include "Mod.h"