//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang-pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace clang {
namespace pseudo {
namespace {

using testing::AllOf;
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Not;

MATCHER_P2(token, Text, Kind, "") {
  return arg.Kind == Kind && arg.text() == Text;
}

MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }

MATCHER_P2(lineIndent, Line, Indent, "") {
  return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
}

MATCHER_P(originalIndex, index, "") {
  return arg.OriginalIndex == (Token::Index)index;
}

TEST(TokenTest, Lex) {
  LangOptions Opts;
  std::string Code = R"cpp(
  #include <stdio.h>
  int main() {
    return 42; // the answer
  }
  )cpp";
  TokenStream Raw = lex(Code, Opts);
  ASSERT_TRUE(Raw.isFinalized());
  EXPECT_THAT(Raw.tokens(),
              ElementsAreArray({
                  // Lexing of directives is weird, especially <angled> strings.
                  token("#", tok::hash),
                  token("include", tok::raw_identifier),
                  token("<", tok::less),
                  token("stdio", tok::raw_identifier),
                  token(".", tok::period),
                  token("h", tok::raw_identifier),
                  token(">", tok::greater),
                  token("int", tok::raw_identifier),
                  token("main", tok::raw_identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::raw_identifier),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));

  TokenStream Cooked = cook(Raw, Opts);
  ASSERT_TRUE(Cooked.isFinalized());
  EXPECT_THAT(Cooked.tokens(),
              ElementsAreArray({
                  // Cooked identifier types in directives are not meaningful.
                  token("#", tok::hash),
                  token("include", tok::identifier),
                  token("<", tok::less),
                  token("stdio", tok::identifier),
                  token(".", tok::period),
                  token("h", tok::identifier),
                  token(">", tok::greater),
                  token("int", tok::kw_int),
                  token("main", tok::identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::kw_return),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));

  // Check raw tokens point back into original source code.
  EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
}
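
// A usage sketch added by the editors, not from upstream: for input without
// ">>", cook() cleans and re-types tokens while keeping the raw and cooked
// streams index-aligned. (A later test shows the exception: ">>" is split
// into two ">"s.) The test name and snippet are ours.
TEST(TokenTest, CookKeepsSimpleStreamsAligned) {
  LangOptions Opts;
  std::string Code = "int x = 42;";
  TokenStream Raw = lex(Code, Opts);
  TokenStream Cooked = cook(Raw, Opts);
  // Both streams hold the same 5 tokens: int, x, =, 42, ;
  EXPECT_EQ(Raw.tokens().size(), Cooked.tokens().size());
  EXPECT_THAT(Cooked.tokens().front(), token("int", tok::kw_int));
}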

TEST(TokenTest, LineContinuation) {
  LangOptions Opts;
  std::string Code = R"cpp(
one_\
token
two \
tokens
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0),
                        originalIndex(0)),
                  AllOf(token("two", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        Not(hasFlag(LexFlags::NeedsCleaning)),
                        originalIndex(1)),
                  AllOf(token("\\\ntokens", tok::raw_identifier),
                        Not(hasFlag(LexFlags::StartsPPLine)),
                        hasFlag(LexFlags::NeedsCleaning), originalIndex(2))));

  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0),
                        originalIndex(0)),
                  AllOf(token("two", tok::identifier), originalIndex(1)),
                  AllOf(token("tokens", tok::identifier), originalIndex(2))));
}
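
// A sketch of a downstream use, ours rather than upstream's: OriginalIndex
// maps a cleaned token back to its raw spelling, e.g. to point diagnostics
// at the bytes the user actually wrote. Only fields asserted above are used.
TEST(TokenTest, OriginalIndexRecoversRawSpelling) {
  LangOptions Opts;
  std::string Code = "one_\\\ntoken"; // identifier with a line continuation
  TokenStream Raw = lex(Code, Opts);
  TokenStream Cooked = cook(Raw, Opts);
  const Token &Clean = Cooked.tokens().front();
  EXPECT_THAT(Clean, token("one_token", tok::identifier));
  // The raw stream still holds the spelling with the escaped newline.
  EXPECT_THAT(Raw.tokens()[Clean.OriginalIndex],
              token("one_\\\ntoken", tok::raw_identifier));
}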

TEST(TokenTest, EncodedCharacters) {
  LangOptions Opts;
  Opts.Trigraphs = true;
  Opts.Digraphs = true;
  Opts.C99 = true; // UCNs
  Opts.CXXOperatorNames = true;
  std::string Code = R"(and <: ??! '??=' \u00E9)";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre( // and is not recognized as && until cook().
          AllOf(token("and", tok::raw_identifier),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Digraphs are just different spellings of tokens.
          AllOf(token("<:", tok::l_square),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Trigraphs are interpreted, but still need text cleaning.
          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
          // Trigraphs must be substituted inside constants too.
          AllOf(token(R"('??=')", tok::char_constant),
                hasFlag(LexFlags::NeedsCleaning)),
          // UCNs need substitution.
          AllOf(token(R"(\u00E9)", tok::raw_identifier),
                hasFlag(LexFlags::NeedsCleaning))));

  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
                  token("<:", tok::l_square),
                  token("|", tok::pipe),            // trigraph substituted
                  token("'#'", tok::char_constant), // trigraph substituted
                  token("é", tok::identifier)));    // UCN substituted
}
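
// A sketch of the LangOptions dependence, ours rather than upstream's: the
// "and" -> "&&" mapping above comes from CXXOperatorNames. We assume a
// default-constructed LangOptions leaves that flag off (the explicit
// assignment above suggests so), in which case the same spelling cooks to a
// plain identifier.
TEST(TokenTest, AlternateSpellingNeedsOperatorNames) {
  LangOptions Opts; // CXXOperatorNames not enabled here.
  std::string Code = "and";
  TokenStream Cooked = cook(lex(Code, Opts), Opts);
  EXPECT_THAT(Cooked.tokens(), ElementsAre(token("and", tok::identifier)));
}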

TEST(TokenTest, Indentation) {
  LangOptions Opts;
  std::string Code = R"cpp(   hello world
no_indent \
  line_was_continued
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(Raw.tokens(), ElementsAreArray({
                                lineIndent(0, 3), // hello
                                lineIndent(0, 3), // world
                                lineIndent(1, 0), // no_indent
                                lineIndent(2, 2), // line_was_continued
                            }));
}
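
// A smaller sketch of the invariant above (ours, not upstream's): Indent is
// a per-line property stamped on every token of the line, and Line is
// 0-based, so a token after two spaces on the first line reports (0, 2).
TEST(TokenTest, IndentationFirstLine) {
  LangOptions Opts;
  std::string Code = "  token";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(Raw.tokens(), ElementsAre(lineIndent(0, 2)));
}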

TEST(TokenTest, SplitGreaterGreater) {
  LangOptions Opts;
  std::string Code = R"cpp(
>> // split
// >> with an escaped newline in the middle, split
>\
>
>>= // not split
)cpp";
  TokenStream Cook = cook(lex(Code, Opts), Opts);
  TokenStream Split = stripComments(Cook);
  EXPECT_THAT(Split.tokens(),
              ElementsAre(AllOf(token(">", tok::greater), originalIndex(0)),
                          AllOf(token(">", tok::greater), originalIndex(0)),
                          // Tokens 1 and 2 are comments.
                          AllOf(token(">", tok::greater), originalIndex(3)),
                          AllOf(token(">", tok::greater), originalIndex(3)),
                          AllOf(token(">>=", tok::greatergreaterequal),
                                originalIndex(4))));
}
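
// A sketch of why splitting matters (ours, not upstream's): a parser closing
// nested templates wants two ">"s, and since both halves keep the same
// OriginalIndex, the compound ">>" stays recoverable. Same pipeline as above.
TEST(TokenTest, SplitTokensShareRawToken) {
  LangOptions Opts;
  std::string Code = "A<B<int>> x;";
  TokenStream Split = stripComments(cook(lex(Code, Opts), Opts));
  // Split: A < B < int > > x ;  (">>" arrives as two ">"s).
  EXPECT_THAT(Split.tokens()[5], token(">", tok::greater));
  EXPECT_THAT(Split.tokens()[6], token(">", tok::greater));
  EXPECT_EQ(Split.tokens()[5].OriginalIndex, Split.tokens()[6].OriginalIndex);
}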

TEST(TokenTest, DropComments) {
  LangOptions Opts;
  std::string Code = R"cpp(
  // comment
  int /*abc*/;
)cpp";
  TokenStream Raw = cook(lex(Code, Opts), Opts);
  TokenStream Stripped = stripComments(Raw);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre(AllOf(token("// comment", tok::comment), originalIndex(0)),
                  AllOf(token("int", tok::kw_int), originalIndex(1)),
                  AllOf(token("/*abc*/", tok::comment), originalIndex(2)),
                  AllOf(token(";", tok::semi), originalIndex(3))));

  EXPECT_THAT(Stripped.tokens(),
              ElementsAre(AllOf(token("int", tok::kw_int), originalIndex(1)),
                          AllOf(token(";", tok::semi), originalIndex(3))));
}
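
// A sketch tying the pieces together (ours, not upstream's): after
// stripComments(), OriginalIndex still addresses the unstripped stream, so a
// consumer can locate what was dropped. With no ">>" in the input, cooked
// indices coincide with the raw lex stream as well.
TEST(TokenTest, StrippedTokensStillAddressable) {
  LangOptions Opts;
  std::string Code = "int /*abc*/ x;";
  TokenStream Cooked = cook(lex(Code, Opts), Opts);
  TokenStream Stripped = stripComments(Cooked);
  const Token &X = Stripped.tokens()[1]; // "x": the comment before it is gone.
  EXPECT_THAT(X, AllOf(token("x", tok::identifier), originalIndex(2)));
  // Index 2 in the cooked stream is still "x"; index 1 is the comment.
  EXPECT_THAT(Cooked.tokens()[X.OriginalIndex], token("x", tok::identifier));
}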
} // namespace
} // namespace pseudo
} // namespace clang