PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.

llvm-svn: 201532
2014-02-17 21:52:30 +00:00 · 2014-02-17 21:52:30 +00:00 · 8b7258bdb3
parent 6287371ce6
commit 8b7258bdb3
6 changed files with 217 additions and 117 deletions
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@ -614,8 +614,28 @@ private:
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
-};

+  /// \brief Try to consume a UCN as part of an identifier at the current
+  /// location.
+  /// \param CurPtr Initially points to the range of characters in the source
+  ///               buffer containing the '\'. Updated to point past the end of
+  ///               the UCN on success.
+  /// \param Size The number of characters occupied by the '\' (including
+  ///             trigraphs and escaped newlines).
+  /// \param Result The token being produced. Marked as containing a UCN on
+  ///               success.
+  /// \return \c true if a UCN was lexed and it produced an acceptable
+  ///         identifier character, \c false otherwise.
+  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
+                               Token &Result);
+
+  /// \brief Try to consume an identifier character encoded in UTF-8.
+  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
+  ///        sequence. On success, updated to point past the end of it.
+  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
+  ///         character was lexed, \c false otherwise.
+  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
+};

 }  // end namespace clang

--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@ -33,6 +33,9 @@ class TargetInfo;
 class SourceManager;
 class LangOptions;

+/// Copy characters from Input to Buf, expanding any UCNs.
+void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input);
+
 /// NumericLiteralParser - This performs strict semantic analysis of the content
 /// of a ppnumber, classifying it as either integer, floating, or erroneous,
 /// determines the radix of the value and can convert it to a useful value.
@ -48,6 +51,8 @@ class NumericLiteralParser {

  bool saw_exponent, saw_period, saw_ud_suffix;

+  SmallString<32> UDSuffixBuf;
+
 public:
  NumericLiteralParser(StringRef TokSpelling,
                       SourceLocation TokLoc,
@ -72,7 +77,7 @@ public:
  }
  StringRef getUDSuffix() const {
    assert(saw_ud_suffix);
-    return StringRef(SuffixBegin, ThisTokEnd - SuffixBegin);
+    return UDSuffixBuf;
  }
  unsigned getUDSuffixOffset() const {
    assert(saw_ud_suffix);
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
        << Range;
    }
  }
- }
+}
+
+bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
+                                    Token &Result) {
+  const char *UCNPtr = CurPtr + Size;
+  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
+  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
+    return false;
+
+  if (!isLexingRawMode())
+    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+                              makeCharRange(*this, CurPtr, UCNPtr),
+                              /*IsFirst=*/false);
+
+  Result.setFlag(Token::HasUCN);
+  if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
+      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
+    CurPtr = UCNPtr;
+  else
+    while (CurPtr != UCNPtr)
+      (void)getAndAdvanceChar(CurPtr, Result);
+  return true;
+}
+
+bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
+  const char *UnicodePtr = CurPtr;
+  UTF32 CodePoint;
+  ConversionResult Result =
+      llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
+                                (const UTF8 *)BufferEnd,
+                                &CodePoint,
+                                strictConversion);
+  if (Result != conversionOK ||
+      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
+    return false;
+
+  if (!isLexingRawMode())
+    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+                              makeCharRange(*this, CurPtr, UnicodePtr),
+                              /*IsFirst=*/false);
+
+  CurPtr = UnicodePtr;
+  return true;
+}

 bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@ -1500,47 +1543,10 @@ FinishIdentifier:
      C = getCharAndSize(CurPtr, Size);
      continue;

-    } else if (C == '\\') {
-      const char *UCNPtr = CurPtr + Size;
-      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
-      if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
-        goto FinishIdentifier;
-
-      if (!isLexingRawMode()) {
-        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
-                                  makeCharRange(*this, CurPtr, UCNPtr),
-                                  /*IsFirst=*/false);
-      }
-
-      Result.setFlag(Token::HasUCN);
-      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
-          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
-        CurPtr = UCNPtr;
-      else
-        while (CurPtr != UCNPtr)
-          (void)getAndAdvanceChar(CurPtr, Result);
-
+    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
-    } else if (!isASCII(C)) {
-      const char *UnicodePtr = CurPtr;
-      UTF32 CodePoint;
-      ConversionResult Result =
-          llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
-                                    (const UTF8 *)BufferEnd,
-                                    &CodePoint,
-                                    strictConversion);
-      if (Result != conversionOK ||
-          !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
-        goto FinishIdentifier;
-
-      if (!isLexingRawMode()) {
-        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
-                                  makeCharRange(*this, CurPtr, UnicodePtr),
-                                  /*IsFirst=*/false);
-      }
-
-      CurPtr = UnicodePtr;
+    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
-  while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
    }
  }

+  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
+  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+    return LexNumericConstant(Result, CurPtr);
+  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+    return LexNumericConstant(Result, CurPtr);
+
  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

-  // Maximally munch an identifier. FIXME: UCNs.
+  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
-  if (isIdentifierHead(C)) {
-    if (!getLangOpts().CPlusPlus11) {
-      if (!isLexingRawMode())
-        Diag(CurPtr,
-             C == '_' ? diag::warn_cxx11_compat_user_defined_literal
-                      : diag::warn_cxx11_compat_reserved_user_defined_literal)
-          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
-      return CurPtr;
-    }
+  bool Consumed = false;

-    // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
-    // that does not start with an underscore is ill-formed. As a conforming
-    // extension, we treat all such suffixes as if they had whitespace before
-    // them.
+  if (!isIdentifierHead(C)) {
+    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+      Consumed = true;
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+      Consumed = true;
+    else
+      return CurPtr;
+  }
+
+  if (!getLangOpts().CPlusPlus11) {
+    if (!isLexingRawMode())
+      Diag(CurPtr,
+           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
+                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
+        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+    return CurPtr;
+  }
+
+  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
+  // that does not start with an underscore is ill-formed. As a conforming
+  // extension, we treat all such suffixes as if they had whitespace before
+  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
+  // likely to be a ud-suffix than a macro, however, and accept that.
+  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
-            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

-    Result.setFlag(Token::HasUDSuffix);
-    do {
-      CurPtr = ConsumeChar(CurPtr, Size, Result);
-      C = getCharAndSize(CurPtr, Size);
-    } while (isIdentifierBody(C));
+    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }
+
+  Result.setFlag(Token::HasUDSuffix);
+  while (true) {
+    C = getCharAndSize(CurPtr, Size);
+    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
+    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
+    else break;
+  }
+
  return CurPtr;
 }

--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
  return ResultChar;
 }

+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 /// return the UTF32.
 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
  }

  if (s != ThisTokEnd) {
-    if (isValidUDSuffix(PP.getLangOpts(),
-                        StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) {
+    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
+    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
+    if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
      // Any suffix pieces we might have parsed are actually part of the
      // ud-suffix.
      isLong = false;
@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    do {
      --end;
    } while (end[-1] != '\'');
-    UDSuffixBuf.assign(end, UDSuffixEnd);
+    // FIXME: Don't bother with this if !tok.hasUCN().
+    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
-        UDSuffixBuf.assign(UDSuffix);
+        if (StringToks[i].hasUCN())
+          expandUCNs(UDSuffixBuf, UDSuffix);
+        else
+          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
-      } else if (!UDSuffixBuf.equals(UDSuffix)) {
+      } else {
+        SmallString<32> ExpandedUDSuffix;
+        if (StringToks[i].hasUCN()) {
+          expandUCNs(ExpandedUDSuffix, UDSuffix);
+          UDSuffix = ExpandedUDSuffix;
+        }
+
        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
-        if (Diags) {
-          SourceLocation TokLoc = StringToks[i].getLocation();
-          Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
-            << UDSuffixBuf << UDSuffix
-            << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
-            << SourceRange(TokLoc, TokLoc);
+        if (!UDSuffixBuf.equals(UDSuffix)) {
+          if (Diags) {
+            SourceLocation TokLoc = StringToks[i].getLocation();
+            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
+              << UDSuffixBuf << UDSuffix
+              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
+              << SourceRange(TokLoc, TokLoc);
+          }
+          hadError = true;
        }
-        hadError = true;
      }
    }

--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//

-static void appendCodePoint(unsigned Codepoint,
-                            llvm::SmallVectorImpl<char> &Str) {
-  char ResultBuf[4];
-  char *ResultPtr = ResultBuf;
-  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
-  (void)Res;
-  assert(Res && "Unexpected conversion failure");
-  Str.append(ResultBuf, ResultPtr);
-}
-
-static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
-  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
-    if (*I != '\\') {
-      Buf.push_back(*I);
-      continue;
-    }
-
-    ++I;
-    assert(*I == 'u' || *I == 'U');
-
-    unsigned NumHexDigits;
-    if (*I == 'u')
-      NumHexDigits = 4;
-    else
-      NumHexDigits = 8;
-
-    assert(I + NumHexDigits <= E);
-
-    uint32_t CodePoint = 0;
-    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
-      unsigned Value = llvm::hexDigitValue(*I);
-      assert(Value != -1U);
-
-      CodePoint <<= 4;
-      CodePoint += Value;
-    }
-
-    appendCodePoint(CodePoint, Buf);
-    --I;
-  }
-}
-
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
--- a/clang/test/Parser/cxx11-user-defined-literals.cpp
+++ b/clang/test/Parser/cxx11-user-defined-literals.cpp
@ -111,3 +111,35 @@ void operator "" ""
 U"" // expected-error {{cannot have an encoding prefix}}
 "" _also_not_char(const char *);
 void operator "" u8"" "\u0123" "hello"_all_of_the_things ""(const char*); // expected-error {{must be '""'}}
+
+// Make sure we treat UCNs and UTF-8 as equivalent.
+int operator""_µs(unsigned long long) {} // expected-note {{previous}}
+int hundred_µs = 50_µs + 50_\u00b5s;
+int operator""_\u00b5s(unsigned long long) {} // expected-error {{redefinition of 'operator "" _µs'}}
+
+int operator""_\U0000212B(long double) {} // expected-note {{previous}}
+int hundred_Å = 50.0_Å + 50._\U0000212B;
+int operator""_Å(long double) {} // expected-error {{redefinition of 'operator "" _Å'}}
+
+int operator""_𐀀(char) {} // expected-note {{previous}}
+int 𐀀 = '4'_𐀀 + '2'_\U00010000;
+int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator "" _𐀀'}}
+
+// These all declare the same function.
+int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t);
+int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t);
+int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t);
+int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e"";
+
+void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_℮""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_\u212e""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_\u212e""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+
+void operator""_℮""_℮(unsigned long long) {} // expected-note {{previous}}
+void operator""_\u212e""_\u212e(unsigned long long) {} // expected-error {{redefinition}}
+
+#define ¢ *0.01 // expected-error {{macro names must be identifiers}}
+constexpr int operator""_¢(long double d) { return d * 100; } // expected-error {{non-ASCII}}
+constexpr int operator""_¢(unsigned long long n) { return n; } // expected-error {{non-ASCII}}
+static_assert(0.02_¢ == 2_¢, ""); // expected-error 2{{non-ASCII}}