[clang][Syntax] Optimize expandedTokens for token ranges.

`expandedTokens(SourceRange)` used to do a binary search to get the expanded tokens belonging to a source range. Each binary search uses `isBeforeInTranslationUnit` to order two source locations. This is inherently very slow. By profiling clangd we found out that users like clangd::SelectionTree spend 95% of time in `isBeforeInTranslationUnit`. Also it is worth noting that users of `expandedTokens(SourceRange)` majorly use ranges provided by AST to query this funciton. The ranges provided by AST are token ranges (starting at the beginning of a token and ending at the beginning of another token). Therefore we can avoid the binary search in majority of the cases by maintaining an index of ExpandedToken by their SourceLocations. We still do binary search for ranges which are not token ranges but such instances are quite low. Performance: `~/build/bin/clangd --check=clang/lib/Serialization/ASTReader.cpp` Before: Took 2:10s to complete. Now: Took 1:13s to complete. Differential Revision: https://reviews.llvm.org/D99086
2021-03-22 15:40:37 +01:00 · 2021-03-22 15:40:37 +01:00 · aa979084df
parent 27899112c6
commit aa979084df
4 changed files with 35 additions and 0 deletions
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@ -423,6 +423,8 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
  // tokens from running the preprocessor inside the checks (only
  // modernize-use-trailing-return-type does that today).
  syntax::TokenBuffer Tokens = std::move(CollectTokens).consume();
+  // Makes SelectionTree build much faster.
+  Tokens.indexExpandedTokens();
  std::vector<Decl *> ParsedDecls = Action->takeTopLevelDecls();
  // AST traversals should exclude the preamble, to avoid performance cliffs.
  Clang->getASTContext().setTraversalScope(ParsedDecls);
--- a/clang/include/clang/Tooling/Syntax/Tokens.h
+++ b/clang/include/clang/Tooling/Syntax/Tokens.h
@ -34,6 +34,7 @@
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Token.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@ -192,8 +193,13 @@ public:
    return ExpandedTokens;
  }

+  /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
+  /// Creates an index only once. Further calls to it will be no-op.
+  void indexExpandedTokens();
+
  /// Returns the subrange of expandedTokens() corresponding to the closed
  /// token range R.
+  /// Consider calling indexExpandedTokens() before for faster lookups.
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;

  /// Returns the subrange of spelled tokens corresponding to AST node spanning
@ -366,6 +372,8 @@ private:
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
+  // Index of ExpandedTokens for faster lookups by SourceLocation.
+  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null, pointer instead of reference to avoid disabling
  // implicit assignment operator.
--- a/clang/lib/Tooling/Syntax/Tokens.cpp
+++ b/clang/lib/Tooling/Syntax/Tokens.cpp
@ -183,7 +183,31 @@ llvm::StringRef FileRange::text(const SourceManager &SM) const {
  return Text.substr(Begin, length());
 }

+void TokenBuffer::indexExpandedTokens() {
+  // No-op if the index is already created.
+  if (!ExpandedTokIndex.empty())
+    return;
+  ExpandedTokIndex.reserve(ExpandedTokens.size());
+  // Index ExpandedTokens for faster lookups by SourceLocation.
+  for (size_t I = 0, E = ExpandedTokens.size(); I != E; ++I)
+    ExpandedTokIndex[ExpandedTokens[I].location()] = I;
+}
+
 llvm::ArrayRef<syntax::Token> TokenBuffer::expandedTokens(SourceRange R) const {
+  if (!ExpandedTokIndex.empty()) {
+    // Quick lookup if `R` is a token range.
+    // This is a huge win since majority of the users use ranges provided by an
+    // AST. Ranges in AST are token ranges from expanded token stream.
+    const auto B = ExpandedTokIndex.find(R.getBegin());
+    const auto E = ExpandedTokIndex.find(R.getEnd());
+    if (B != ExpandedTokIndex.end() && E != ExpandedTokIndex.end()) {
+      // Add 1 to End to make a half-open range.
+      return {ExpandedTokens.data() + B->getSecond(),
+              ExpandedTokens.data() + E->getSecond() + 1};
+    }
+  }
+  // Slow case. Use `isBeforeInTranslationUnit` to binary search for the
+  // required range.
  return getTokensCovering(expandedTokens(), R, *SourceMgr);
 }

--- a/clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp
@ -106,6 +106,7 @@ public:
      void EndSourceFileAction() override {
        assert(Collector && "BeginSourceFileAction was never called");
        Result = std::move(*Collector).consume();
+        Result.indexExpandedTokens();
      }

      std::unique_ptr<ASTConsumer>