PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.

llvm-svn: 201532
2014-02-17 21:52:30 +00:00 · 2014-02-17 21:52:30 +00:00 · 8b7258bdb3
parent 6287371ce6
commit 8b7258bdb3
6 changed files with 217 additions and 117 deletions
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@ -614,8 +614,28 @@ private:
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
 };
  /// \brief Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end of
  ///               the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines).
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);
  /// \brief Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///        sequence. On success, updated to point past the end of it.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
 };
 }  // end namespace clang
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@ -33,6 +33,9 @@ class TargetInfo;
 class SourceManager;
 class LangOptions;
 /// Copy characters from Input to Buf, expanding any UCNs.
 void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input);
 /// NumericLiteralParser - This performs strict semantic analysis of the content
 /// of a ppnumber, classifying it as either integer, floating, or erroneous,
 /// determines the radix of the value and can convert it to a useful value.
@ -48,6 +51,8 @@ class NumericLiteralParser {
  bool saw_exponent, saw_period, saw_ud_suffix;
  SmallString<32> UDSuffixBuf;
 public:
  NumericLiteralParser(StringRef TokSpelling,
                       SourceLocation TokLoc,
@ -72,7 +77,7 @@ public:
  }
  StringRef getUDSuffix() const {
    assert(saw_ud_suffix);
-    return StringRef(SuffixBegin, ThisTokEnd - SuffixBegin);
+    return UDSuffixBuf;
  }
  unsigned getUDSuffixOffset() const {
    assert(saw_ud_suffix);
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
        << Range;
    }
  }
- }
+}
 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;
  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  Result.setFlag(Token::HasUCN);
  if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
 }
 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  UTF32 CodePoint;
  ConversionResult Result =
      llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
                                (const UTF8 *)BufferEnd,
                                &CodePoint,
                                strictConversion);
  if (Result != conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;
  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
  CurPtr = UnicodePtr;
  return true;
 }
 bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@ -1500,47 +1543,10 @@ FinishIdentifier:
      C = getCharAndSize(CurPtr, Size);
      continue;
-    } else if (C == '\\') {
+    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      const char *UCNPtr = CurPtr + Size;
      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
      if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
        goto FinishIdentifier;
      if (!isLexingRawMode()) {
        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                                  makeCharRange(*this, CurPtr, UCNPtr),
                                  /*IsFirst=*/false);
      }
      Result.setFlag(Token::HasUCN);
      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
        CurPtr = UCNPtr;
      else
        while (CurPtr != UCNPtr)
          (void)getAndAdvanceChar(CurPtr, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
-    } else if (!isASCII(C)) {
+    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      const char *UnicodePtr = CurPtr;
      UTF32 CodePoint;
      ConversionResult Result =
          llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
                                    (const UTF8 *)BufferEnd,
                                    &CodePoint,
                                    strictConversion);
      if (Result != conversionOK ||
          !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
        goto FinishIdentifier;
      if (!isLexingRawMode()) {
        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                                  makeCharRange(*this, CurPtr, UnicodePtr),
                                  /*IsFirst=*/false);
      }
      CurPtr = UnicodePtr;
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
-  while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
    }
  }
  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);
  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);
-  // Maximally munch an identifier. FIXME: UCNs.
+  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
-  if (isIdentifierHead(C)) {
+  bool Consumed = false;
    if (!getLangOpts().CPlusPlus11) {
      if (!isLexingRawMode())
        Diag(CurPtr,
             C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                      : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }
-    // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
+  if (!isIdentifierHead(C)) {
-    // that does not start with an underscore is ill-formed. As a conforming
+    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
-    // extension, we treat all such suffixes as if they had whitespace before
+      Consumed = true;
-    // them.
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;
  }
  if (!getLangOpts().CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }
  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
-            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }
-    Result.setFlag(Token::HasUDSuffix);
+    CurPtr = ConsumeChar(CurPtr, Size, Result);
    do {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    } while (isIdentifierBody(C));
  }
  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }
  return CurPtr;
 }
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
  return ResultChar;
 }
 /// Append the UTF-8 encoding of \p Codepoint to \p Str.
 ///
 /// The conversion is expected to succeed; a failure trips the assertion.
 static void appendCodePoint(unsigned Codepoint,
                            llvm::SmallVectorImpl<char> &Str) {
  // Scratch buffer for the encoded bytes. The converter receives EncodedEnd,
  // which afterwards delimits the bytes that get appended.
  char Encoded[4];
  char *EncodedEnd = Encoded;
  const bool Converted = llvm::ConvertCodePointToUTF8(Codepoint, EncodedEnd);
  assert(Converted && "Unexpected conversion failure");
  // Only the assertion reads Converted; keep it "used" when asserts are off.
  (void)Converted;
  Str.append(Encoded, EncodedEnd);
 }
 /// Copy \p Input into \p Buf, replacing every UCN with its UTF-8 encoding.
 ///
 /// A UCN is a backslash followed by 'u' and 4 hex digits, or by 'U' and 8 hex
 /// digits. Every other character is copied through unchanged.
 ///
 /// \param Buf   Destination; characters are appended, existing content kept.
 /// \param Input Text to expand. Every backslash in it must begin a complete,
 ///              well-formed UCN; this is checked with assertions only.
 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
    if (*I != '\\') {
      Buf.push_back(*I);
      continue;
    }
    // Step onto the selector character, making sure there is one to read.
    ++I;
    assert(I != E && "backslash at end of input does not begin a UCN");
    assert(*I == 'u' || *I == 'U');
    const unsigned NumHexDigits = (*I == 'u') ? 4 : 8;
    // Step past the selector so that I addresses the first hex digit. Only
    // now is "NumHexDigits characters remain" the right bound; checking it
    // while I was still on the selector would be off by one.
    ++I;
    assert(static_cast<size_t>(E - I) >= NumHexDigits && "truncated UCN");
    // Accumulate the digits most-significant first, four bits per digit.
    uint32_t CodePoint = 0;
    for (unsigned Remaining = NumHexDigits; Remaining != 0;
         ++I, --Remaining) {
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);
      CodePoint <<= 4;
      CodePoint += Value;
    }
    appendCodePoint(CodePoint, Buf);
    // I is one past the last digit; back up so the enclosing loop's ++I lands
    // on the character that follows the UCN.
    --I;
  }
 }
 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 /// return the UTF32.
 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
  }
  if (s != ThisTokEnd) {
-    if (isValidUDSuffix(PP.getLangOpts(),
+    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
-                        StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) {
+    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
    if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
      // Any suffix pieces we might have parsed are actually part of the
      // ud-suffix.
      isLong = false;
@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    do {
      --end;
    } while (end[-1] != '\'');
-    UDSuffixBuf.assign(end, UDSuffixEnd);
+    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }
@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
      if (UDSuffixBuf.empty()) {
-        UDSuffixBuf.assign(UDSuffix);
+        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
-      } else if (!UDSuffixBuf.equals(UDSuffix)) {
+      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }
        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
-        if (Diags) {
+        if (!UDSuffixBuf.equals(UDSuffix)) {
-          SourceLocation TokLoc = StringToks[i].getLocation();
+          if (Diags) {
-          Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
+            SourceLocation TokLoc = StringToks[i].getLocation();
-            << UDSuffixBuf << UDSuffix
+            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
-            << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
+              << UDSuffixBuf << UDSuffix
-            << SourceRange(TokLoc, TokLoc);
+              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
        hadError = true;
      }
    }
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 /// Append the UTF-8 encoding of \p Codepoint to \p Str.
 ///
 /// The conversion is expected to succeed; a failure trips the assertion.
 static void appendCodePoint(unsigned Codepoint,
                            llvm::SmallVectorImpl<char> &Str) {
  // Scratch buffer for the encoded bytes. ResultPtr is handed to the converter
  // and afterwards delimits the bytes that get appended.
  char ResultBuf[4];
  char *ResultPtr = ResultBuf;
  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
  // Only the assertion reads Res; the cast keeps it "used" when asserts are off.
  (void)Res;
  assert(Res && "Unexpected conversion failure");
  Str.append(ResultBuf, ResultPtr);
 }
 /// Copy \p Input into \p Buf, replacing every UCN with the bytes that
 /// appendCodePoint produces for its hexadecimal value.
 ///
 /// A UCN is a backslash followed by 'u' and 4 hex digits, or by 'U' and 8 hex
 /// digits. Every other character is copied through unchanged. The shape of
 /// each escape is checked with assertions only.
 static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
    // Anything other than a backslash is copied through verbatim.
    if (*I != '\\') {
      Buf.push_back(*I);
      continue;
    }
    // Step onto the character after the backslash, which selects the form.
    ++I;
    assert(*I == 'u' || *I == 'U');
    // 'u' is followed by 4 hex digits, 'U' by 8.
    unsigned NumHexDigits;
    if (*I == 'u')
      NumHexDigits = 4;
    else
      NumHexDigits = 8;
    // NOTE(review): I still addresses the 'u'/'U' here, so the last digit sits
    // at I + NumHexDigits; this check also passes when that position is E.
    assert(I + NumHexDigits <= E);
    uint32_t CodePoint = 0;
    // Accumulate the digits most-significant first, four bits per digit.
    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);
      CodePoint <<= 4;
      CodePoint += Value;
    }
    appendCodePoint(CodePoint, Buf);
    // I is one past the last digit; back up so the enclosing loop's ++I lands
    // on the character that follows the UCN.
    --I;
  }
 }
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
--- a/clang/test/Parser/cxx11-user-defined-literals.cpp
+++ b/clang/test/Parser/cxx11-user-defined-literals.cpp
@ -111,3 +111,35 @@ void operator "" ""
 U"" // expected-error {{cannot have an encoding prefix}}
 "" _also_not_char(const char *);
 void operator "" u8"" "\u0123" "hello"_all_of_the_things ""(const char*); // expected-error {{must be '""'}}
 // Make sure we treat UCNs and UTF-8 as equivalent.
 int operator""_µs(unsigned long long) {} // expected-note {{previous}}
 int hundred_µs = 50_µs + 50_\u00b5s;
 int operator""_\u00b5s(unsigned long long) {} // expected-error {{redefinition of 'operator "" _µs'}}
 int operator""_\U0000212B(long double) {} // expected-note {{previous}}
 int hundred_Å = 50.0_Å + 50._\U0000212B;
 int operator""_Å(long double) {} // expected-error {{redefinition of 'operator "" _Å'}}
 int operator""_𐀀(char) {} // expected-note {{previous}}
 int 𐀀 = '4'_𐀀 + '2'_\U00010000;
 int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator "" _𐀀'}}
 // These all declare the same function.
 int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t);
 int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t);
 int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t);
 int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e"";
 void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
 void operator""_℮""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
 void operator""_\u212e""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
 void operator""_\u212e""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
 void operator""_℮""_℮(unsigned long long) {} // expected-note {{previous}}
 void operator""_\u212e""_\u212e(unsigned long long) {} // expected-error {{redefinition}}
 #define ¢ *0.01 // expected-error {{macro names must be identifiers}}
 constexpr int operator""_¢(long double d) { return d * 100; } // expected-error {{non-ASCII}}
 constexpr int operator""_¢(unsigned long long n) { return n; } // expected-error {{non-ASCII}}
 static_assert(0.02_¢ == 2_¢, ""); // expected-error 2{{non-ASCII}}