Revert "[Clang] Add a warning on invalid UTF-8 in comments."

Reverting while I investigate build failures

This reverts commit e3dc56805f.
This commit is contained in:
Corentin Jabot 2022-07-06 19:45:12 +02:00
parent 23c2bedfd9
commit fb06dd3e8c
6 changed files with 18 additions and 140 deletions

View File

@ -279,8 +279,6 @@ Improvements to Clang's diagnostics
unevaluated operands of a ``typeid`` expression, as they are now
modeled correctly in the CFG. This fixes
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
comments.
Non-comprehensive list of changes in this release
-------------------------------------------------
@ -578,7 +576,7 @@ AST Matchers
- Added ``forEachTemplateArgument`` matcher which creates a match every
time a ``templateArgument`` matches the matcher supplied to it.
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
literal expressions.

View File

@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
def warn_invalid_utf8_in_comment : Extension<
"invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<

View File

@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;
char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
while (isASCII(C) && C != 0 && // Potentially EOF.
C != '\n' && C != '\r') { // Newline or DOS-style newline.
while (C != 0 && // Potentially EOF.
C != '\n' && C != '\r') // Newline or DOS-style newline.
C = *++CurPtr;
UnicodeDecodingAlreadyDiagnosed = false;
}
if (!isASCII(C)) {
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
++CurPtr;
} else {
UnicodeDecodingAlreadyDiagnosed = false;
CurPtr += Length;
}
continue;
}
const char *NextLine = CurPtr;
if (C != 0) {
@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
if (C == '/')
C = *CurPtr++;
// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;
while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
@ -2703,24 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
if (!isASCII(C)) {
CurPtr--;
goto MultiByteUTF8;
}
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
}
if (C == '/') goto FoundSlash;
#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
while (CurPtr + 16 < BufferEnd) {
int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
if (LLVM_UNLIKELY(Mask != 0)) {
CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
goto MultiByteUTF8;
}
// look for slashes
while (CurPtr+16 <= BufferEnd) {
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
@ -2733,41 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
CurPtr += 16;
}
#elif __ALTIVEC__
__vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
while (CurPtr + 16 < BufferEnd) {
if (LLVM_UNLIKELY(
vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
goto MultiByteUTF8;
if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
C = *CurPtr++;
break;
}
while (CurPtr + 16 <= BufferEnd &&
!vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
CurPtr += 16;
}
#else
while (CurPtr + 16 <= BufferEnd) {
bool HasNonASCII = false;
for (unsigned I = 0; I < 16; ++I) {
HasNonASCII |= !isASCII(CurPtr[I]);
}
if (LLVM_UNLIKELY(HasNonASCII))
goto MultiByteUTF8;
bool HasSlash = false;
for (unsigned I = 0; I < 16; ++I) {
HasSlash |= CurPtr[I] == '/';
}
if (HasSlash)
break;
CurPtr += 16;
// Scan for '/' quickly. Many block comments are very large.
while (CurPtr[0] != '/' &&
CurPtr[1] != '/' &&
CurPtr[2] != '/' &&
CurPtr[3] != '/' &&
CurPtr+4 < BufferEnd) {
CurPtr += 4;
}
#endif
@ -2775,28 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
C = *CurPtr++;
}
// Loop to scan the remainder, warning on invalid UTF-8
// if the corresponding warning is enabled, emitting a diagnostic only once
// per sequence that cannot be decoded.
while (C != '/' && C != '\0') {
if (isASCII(C)) {
UnicodeDecodingAlreadyDiagnosed = false;
C = *CurPtr++;
continue;
}
MultiByteUTF8:
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
C = *CurPtr++;
continue;
}
UnicodeDecodingAlreadyDiagnosed = false;
C = *(CurPtr += Length - 1);
}
// Loop to scan the remainder.
while (C != '/' && C != '\0')
C = *CurPtr++;
if (C == '/') {
FoundSlash:

View File

@ -1,27 +0,0 @@
// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
// nowarn-no-diagnostics
// This file is purposefully encoded as windows-1252
// be careful when modifying.
//€
// expected-warning@-1 {{invalid UTF-8 in comment}}
// € ‚ƒ„…†‡ˆ‰ Š Œ Ž
// expected-warning@-1 6{{invalid UTF-8 in comment}}
/*€*/
// expected-warning@-1 {{invalid UTF-8 in comment}}
/*€ ‚ƒ„…†‡ˆ‰ Š Œ Ž*/
// expected-warning@-1 6{{invalid UTF-8 in comment}}
/*
*/
// expected-warning@-2 {{invalid UTF-8 in comment}}
// abcd
// €abcd
// expected-warning@-1 {{invalid UTF-8 in comment}}

View File

@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
unsigned getNumBytesForUTF8(UTF8 firstByte);
/*************************************************************************/

View File

@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
return isLegalUTF8(source, length);
}
/*
 * Exported function to return the size, in bytes, of the UTF-8 code unit
 * sequence that starts at `source`, or 0 if that sequence is not valid.
 *
 *   source    - first byte of the sequence to measure.
 *   sourceEnd - one past the last byte that may be read.
 *
 * The expected length is derived from the lead byte via trailingBytesForUTF8.
 * A sequence whose expected length would extend past `sourceEnd` is reported
 * as invalid (0), so isLegalUTF8 is only ever handed bytes that lie inside
 * [source, sourceEnd).
 */
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
  /* Empty range: there is no lead byte to inspect. */
  if (source >= sourceEnd)
    return 0;
  int length = trailingBytesForUTF8[*source] + 1;
  /* The sequence must fit in the remaining bytes *and* be well-formed. */
  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
                                                                       : 0;
}
/* --------------------------------------------------------------------- */
static unsigned