clang-format: [JS] nested and tagged template strings.

JavaScript template strings can be nested arbitrarily:

    foo = `text ${es.map(e => { return `<${e}>`; })} text`;

This change lexes nested template strings using a stack of lexer states to correctly switch back to template string lexing on closing braces. It also reuses the same stack for the token-stashed logic.

Reviewers: djasper
Subscribers: cfe-commits, klimek
Differential Revision: https://reviews.llvm.org/D22431
llvm-svn: 279727
2016-08-25 10:13:21 +00:00 · 2016-08-25 10:13:21 +00:00 · 6181da4796
parent 86ce267a4a
commit 6181da4796
4 changed files with 84 additions and 24 deletions
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@ -26,12 +26,11 @@ namespace format {
 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
                                   const FormatStyle &Style,
                                   encoding::Encoding Encoding)
-    : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
-      LessStashed(false), Column(0), TrailingWhitespace(0),
-      SourceMgr(SourceMgr), ID(ID), Style(Style),
-      IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
-      Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
-      MacroBlockBeginRegex(Style.MacroBlockBegin),
+    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
+      Column(0), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
+      Style(Style), IdentTable(getFormattingLangOpts(Style)),
+      Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
+      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
                      getFormattingLangOpts(Style)));
@ -49,7 +48,7 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
    Tokens.push_back(getNextToken());
    if (Style.Language == FormatStyle::LK_JavaScript) {
      tryParseJSRegexLiteral();
-      tryParseTemplateString();
+      handleTemplateStrings();
    }
    tryMergePreviousTokens();
    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
@ -228,17 +227,42 @@ void FormatTokenLexer::tryParseJSRegexLiteral() {
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
 }

-void FormatTokenLexer::tryParseTemplateString() {
+void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();
-  if (!BacktickToken->is(tok::unknown) || BacktickToken->TokenText != "`")
+
+  if (BacktickToken->is(tok::l_brace)) {
+    StateStack.push(LexerState::NORMAL);
    return;
+  }
+  if (BacktickToken->is(tok::r_brace)) {
+    StateStack.pop();
+    if (StateStack.top() != LexerState::TEMPLATE_STRING)
+      return;
+    // If back in TEMPLATE_STRING, fall through and resume lexing the template.
+  } else if (BacktickToken->is(tok::unknown) &&
+             BacktickToken->TokenText == "`") {
+    StateStack.push(LexerState::TEMPLATE_STRING);
+  } else {
+    return; // Not actually a template
+  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
-  for (; Offset != Lex->getBuffer().end() && *Offset != '`'; ++Offset) {
-    if (*Offset == '\\')
+  for (; Offset != Lex->getBuffer().end(); ++Offset) {
+    if (Offset[0] == '`') {
+      StateStack.pop();
+      break;
+    }
+    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
+    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
+               Offset[1] == '{') {
+      // '${' introduces an expression interpolation in the template string.
+      StateStack.push(LexerState::NORMAL);
+      ++Offset;
+      break;
+    }
  }

  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
@ -262,7 +286,10 @@ void FormatTokenLexer::tryParseTemplateString() {
        Style.TabWidth, Encoding);
  }

-  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
+  SourceLocation loc = Offset < Lex->getBuffer().end()
+                           ? Lex->getSourceLocation(Offset + 1)
+                           : SourceMgr.getLocForEndOfFile(ID);
+  resetLexer(SourceMgr.getFileOffset(loc));
 }

 bool FormatTokenLexer::tryMerge_TMacro() {
@ -384,12 +411,8 @@ FormatToken *FormatTokenLexer::getStashedToken() {
 }

 FormatToken *FormatTokenLexer::getNextToken() {
-  if (GreaterStashed) {
-    GreaterStashed = false;
-    return getStashedToken();
-  }
-  if (LessStashed) {
-    LessStashed = false;
+  if (StateStack.top() == LexerState::TOKEN_STASHED) {
+    StateStack.pop();
    return getStashedToken();
  }

@ -500,11 +523,11 @@ FormatToken *FormatTokenLexer::getNextToken() {
  } else if (FormatTok->Tok.is(tok::greatergreater)) {
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-    GreaterStashed = true;
+    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->Tok.is(tok::lessless)) {
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-    LessStashed = true;
+    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@ -23,9 +23,17 @@
 #include "clang/Format/Format.h"
 #include "llvm/Support/Regex.h"

+#include <stack>
+
 namespace clang {
 namespace format {

+enum LexerState {
+  NORMAL,
+  TEMPLATE_STRING,
+  TOKEN_STASHED,
+};
+
 class FormatTokenLexer {
 public:
  FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
@ -53,7 +61,16 @@ private:
  // its text if successful.
  void tryParseJSRegexLiteral();

-  void tryParseTemplateString();
+  // Handles JavaScript template strings.
+  //
+  // JavaScript template strings use backticks ('`') as delimiters, and allow
+  // embedding expressions nested in ${expr-here}. Template strings can be
+  // nested recursively, i.e. expressions can contain template strings in turn.
+  //
+  // The code below parses starting from a backtick, up to a closing backtick or
+  // an opening ${. It also maintains a stack of lexing contexts to handle
+  // nested template parts by balancing curly braces.
+  void handleTemplateStrings();

  bool tryMerge_TMacro();

@ -65,7 +82,7 @@ private:

  FormatToken *FormatTok;
  bool IsFirstToken;
-  bool GreaterStashed, LessStashed;
+  std::stack<LexerState> StateStack;
  unsigned Column;
  unsigned TrailingWhitespace;
  std::unique_ptr<Lexer> Lex;
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@ -858,7 +858,7 @@ private:
    if (!CurrentToken->isOneOf(TT_LambdaLSquare, TT_ForEachMacro,
                               TT_FunctionLBrace, TT_ImplicitStringLiteral,
                               TT_InlineASMBrace, TT_JsFatArrow, TT_LambdaArrow,
-                               TT_RegexLiteral))
+                               TT_RegexLiteral, TT_TemplateString))
      CurrentToken->Type = TT_Unknown;
    CurrentToken->Role.reset();
    CurrentToken->MatchingParen = nullptr;
@ -1816,6 +1816,9 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
      return 100;
    if (Left.is(TT_JsTypeColon))
      return 35;
+    if ((Left.is(TT_TemplateString) && Left.TokenText.endswith("${")) ||
+        (Right.is(TT_TemplateString) && Right.TokenText.startswith("}")))
+      return 100;
  }

  if (Left.is(tok::comma) || (Right.is(tok::identifier) && Right.Next &&
@ -2114,6 +2117,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
  } else if (Style.Language == FormatStyle::LK_JavaScript) {
    if (Left.is(TT_JsFatArrow))
      return true;
+    if ((Left.is(TT_TemplateString) && Left.TokenText.endswith("${")) ||
+        (Right.is(TT_TemplateString) && Right.TokenText.startswith("}")))
+      return false;
+    if (Left.is(tok::identifier) && Right.is(TT_TemplateString))
+      return false;
    if (Right.is(tok::star) &&
        Left.isOneOf(Keywords.kw_function, Keywords.kw_yield))
      return false;
--- a/clang/unittests/Format/FormatTestJS.cpp
+++ b/clang/unittests/Format/FormatTestJS.cpp
@ -1122,7 +1122,7 @@ TEST_F(FormatTestJS, ImportWrapping) {
 TEST_F(FormatTestJS, TemplateStrings) {
  // Keeps any whitespace/indentation within the template string.
  verifyFormat("var x = `hello\n"
-            "     ${  name    }\n"
+            "     ${name}\n"
            "  !`;",
            "var x    =    `hello\n"
                   "     ${  name    }\n"
@ -1206,6 +1206,18 @@ TEST_F(FormatTestJS, TemplateStrings) {
               "var y;",
               "var x = ` \\` a`;\n"
               "var y;");
+  // Escaped dollar.
+  verifyFormat("var x = ` \\${foo}`;\n");
+}
+
+TEST_F(FormatTestJS, NestedTemplateStrings) {
+  verifyFormat(
+      "var x = `<ul>${xs.map(x => `<li>${x}</li>`).join('\\n')}</ul>`;");
+  verifyFormat("var x = `he${({text: 'll'}.text)}o`;");
+}
+
+TEST_F(FormatTestJS, TaggedTemplateStrings) {
+  verifyFormat("var x = html`<ul>`;");
 }

 TEST_F(FormatTestJS, CastSyntax) {