Revert "[Clang] Add a warning on invalid UTF-8 in comments."

Reverting while I investigate build failures. This reverts commit e3dc56805f.
2022-07-06 19:45:12 +02:00 · 2022-07-06 19:45:12 +02:00 · fb06dd3e8c
parent 23c2bedfd9
commit fb06dd3e8c
6 changed files with 18 additions and 140 deletions
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@ -279,8 +279,6 @@ Improvements to Clang's diagnostics
  unevaluated operands of a ``typeid`` expression, as they are now
  modeled correctly in the CFG. This fixes
  `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
 - Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
  comments.
 Non-comprehensive list of changes in this release
 -------------------------------------------------
@ -578,7 +576,7 @@ AST Matchers
 - Added ``forEachTemplateArgument`` matcher which creates a match every
  time a ``templateArgument`` matches the matcher supplied to it.
-
+  
 - Added ``objcStringLiteral`` matcher which matches ObjectiveC String
  literal expressions.
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
 // Unicode and UCNs
 def err_invalid_utf8 : Error<
  "source file is not valid UTF-8">;
 def warn_invalid_utf8_in_comment : Extension<
  "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
 def err_character_not_allowed : Error<
  "unexpected character <U+%0>">;
 def err_character_not_allowed_identifier : Error<
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
-    while (isASCII(C) && C != 0 &&   // Potentially EOF.
+    while (C != 0 &&                // Potentially EOF.
-           C != '\n' && C != '\r') { // Newline or DOS-style newline.
+           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }
    if (!isASCII(C)) {
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }
    const char *NextLine = CurPtr;
    if (C != 0) {
@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
  if (C == '/')
    C = *CurPtr++;
  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;
  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
@ -2703,24 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
-      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
+      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        if (!isASCII(C)) {
          CurPtr--;
          goto MultiByteUTF8;
        }
        C = *CurPtr++;
-      }
+
      if (C == '/') goto FoundSlash;
 #ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
-      while (CurPtr + 16 < BufferEnd) {
+      while (CurPtr+16 <= BufferEnd) {
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
@ -2733,41 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
        CurPtr += 16;
      }
 #elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
-      while (CurPtr + 16 < BufferEnd) {
+      while (CurPtr + 16 <= BufferEnd &&
-        if (LLVM_UNLIKELY(
+             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          C = *CurPtr++;
          break;
        }
        CurPtr += 16;
      }
 #else
-      while (CurPtr + 16 <= BufferEnd) {
+      // Scan for '/' quickly.  Many block comments are very large.
-        bool HasNonASCII = false;
+      while (CurPtr[0] != '/' &&
-        for (unsigned I = 0; I < 16; ++I) {
+             CurPtr[1] != '/' &&
-          HasNonASCII |= !isASCII(CurPtr[I]);
+             CurPtr[2] != '/' &&
-        }
+             CurPtr[3] != '/' &&
-
+             CurPtr+4 < BufferEnd) {
-        if (LLVM_UNLIKELY(HasNonASCII))
+        CurPtr += 4;
          goto MultiByteUTF8;
        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I) {
          HasSlash |= CurPtr[I] == '/';
        }
        if (HasSlash)
          break;
        CurPtr += 16;
      }
 #endif
@ -2775,28 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
      C = *CurPtr++;
    }
-    // Loop to scan the remainder, warning on invalid UTF-8
+    // Loop to scan the remainder.
-    // if the corresponding warning is enabled, emitting a diagnostic only once
+    while (C != '/' && C != '\0')
-    // per sequence that cannot be decoded.
+      C = *CurPtr++;
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        C = *CurPtr++;
        continue;
      }
      UnicodeDecodingAlreadyDiagnosed = false;
      C = *(CurPtr += Length - 1);
    }
    if (C == '/') {
  FoundSlash:
--- a/clang/test/Lexer/comment-invalid-utf8.c
+++ b/clang/test/Lexer/comment-invalid-utf8.c
@ -1,27 +0,0 @@
 // RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
 // RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
 // nowarn-no-diagnostics
 // This file is purposefully encoded as windows-1252
 // be careful when modifying.
 //€
 // expected-warning@-1 {{invalid UTF-8 in comment}}
 // € ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž
 // expected-warning@-1 6{{invalid UTF-8 in comment}}
 /*€*/
 // expected-warning@-1 {{invalid UTF-8 in comment}}
 /*€ ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž*/
 // expected-warning@-1 6{{invalid UTF-8 in comment}}
 /*
 €
 */
 // expected-warning@-2 {{invalid UTF-8 in comment}}
 // abcd
 // €abcd
 // expected-warning@-1 {{invalid UTF-8 in comment}}
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 /*************************************************************************/
--- a/llvm/lib/Support/ConvertUTF.cpp
+++ b/llvm/lib/Support/ConvertUTF.cpp
@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    return isLegalUTF8(source, length);
 }
 /*
 * Exported function to return the size, in bytes, of the first UTF-8 code
 * unit sequence in [source, sourceEnd), or 0 if that sequence is not valid.
 *
 * source:    points at the lead byte of the sequence to measure.
 * sourceEnd: one past the last readable byte of the buffer.
 *
 * Returns 0 when the range is empty, when the sequence announced by the lead
 * byte would extend past sourceEnd (truncated input), or when isLegalUTF8
 * rejects the bytes.
 */
 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
  /* Nothing to decode; also avoids dereferencing source at the end of the
   * buffer. */
  if (source >= sourceEnd)
    return 0;
  /* The lead byte alone determines how many bytes the sequence claims. */
  int length = trailingBytesForUTF8[*source] + 1;
  /* The bounds check must come first and must require that the whole sequence
   * fits: only then may isLegalUTF8 inspect all `length` bytes. */
  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
                                                                       : 0;
 }
 /* --------------------------------------------------------------------- */
 static unsigned