forked from OSchip/llvm-project
Revert "[Clang] Add a warning on invalid UTF-8 in comments."
Reverting while I investigate build failures
This reverts commit e3dc56805f.
This commit is contained in:
parent
23c2bedfd9
commit
fb06dd3e8c
|
@ -279,8 +279,6 @@ Improvements to Clang's diagnostics
|
|||
unevaluated operands of a ``typeid`` expression, as they are now
|
||||
modeled correctly in the CFG. This fixes
|
||||
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
|
||||
- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
|
||||
comments.
|
||||
|
||||
Non-comprehensive list of changes in this release
|
||||
-------------------------------------------------
|
||||
|
@ -578,7 +576,7 @@ AST Matchers
|
|||
|
||||
- Added ``forEachTemplateArgument`` matcher which creates a match every
|
||||
time a ``templateArgument`` matches the matcher supplied to it.
|
||||
|
||||
|
||||
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
|
||||
literal expressions.
|
||||
|
||||
|
|
|
@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
|
|||
// Unicode and UCNs
|
||||
def err_invalid_utf8 : Error<
|
||||
"source file is not valid UTF-8">;
|
||||
def warn_invalid_utf8_in_comment : Extension<
|
||||
"invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
|
||||
def err_character_not_allowed : Error<
|
||||
"unexpected character <U+%0>">;
|
||||
def err_character_not_allowed_identifier : Error<
|
||||
|
|
|
@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
|
|||
//
|
||||
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
|
||||
// character that ends the line comment.
|
||||
|
||||
// C++23 [lex.phases] p1
|
||||
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
|
||||
// diagnostic only once per entire ill-formed subsequence to avoid
|
||||
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
|
||||
bool UnicodeDecodingAlreadyDiagnosed = false;
|
||||
|
||||
char C;
|
||||
while (true) {
|
||||
C = *CurPtr;
|
||||
// Skip over characters in the fast loop.
|
||||
while (isASCII(C) && C != 0 && // Potentially EOF.
|
||||
C != '\n' && C != '\r') { // Newline or DOS-style newline.
|
||||
while (C != 0 && // Potentially EOF.
|
||||
C != '\n' && C != '\r') // Newline or DOS-style newline.
|
||||
C = *++CurPtr;
|
||||
UnicodeDecodingAlreadyDiagnosed = false;
|
||||
}
|
||||
|
||||
if (!isASCII(C)) {
|
||||
unsigned Length = llvm::getUTF8SequenceSize(
|
||||
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
|
||||
if (Length == 0) {
|
||||
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
|
||||
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
|
||||
UnicodeDecodingAlreadyDiagnosed = true;
|
||||
++CurPtr;
|
||||
} else {
|
||||
UnicodeDecodingAlreadyDiagnosed = false;
|
||||
CurPtr += Length;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
const char *NextLine = CurPtr;
|
||||
if (C != 0) {
|
||||
|
@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
|||
if (C == '/')
|
||||
C = *CurPtr++;
|
||||
|
||||
// C++23 [lex.phases] p1
|
||||
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
|
||||
// diagnostic only once per entire ill-formed subsequence to avoid
|
||||
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
|
||||
bool UnicodeDecodingAlreadyDiagnosed = false;
|
||||
|
||||
while (true) {
|
||||
// Skip over all non-interesting characters until we find end of buffer or a
|
||||
// (probably ending) '/' character.
|
||||
|
@ -2703,24 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
|||
// doesn't check for '\0'.
|
||||
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
|
||||
// While not aligned to a 16-byte boundary.
|
||||
while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
|
||||
if (!isASCII(C)) {
|
||||
CurPtr--;
|
||||
goto MultiByteUTF8;
|
||||
}
|
||||
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
|
||||
C = *CurPtr++;
|
||||
}
|
||||
|
||||
if (C == '/') goto FoundSlash;
|
||||
|
||||
#ifdef __SSE2__
|
||||
__m128i Slashes = _mm_set1_epi8('/');
|
||||
while (CurPtr + 16 < BufferEnd) {
|
||||
int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
|
||||
if (LLVM_UNLIKELY(Mask != 0)) {
|
||||
CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
|
||||
goto MultiByteUTF8;
|
||||
}
|
||||
// look for slashes
|
||||
while (CurPtr+16 <= BufferEnd) {
|
||||
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
|
||||
Slashes));
|
||||
if (cmp != 0) {
|
||||
|
@ -2733,41 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
|||
CurPtr += 16;
|
||||
}
|
||||
#elif __ALTIVEC__
|
||||
__vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80};
|
||||
__vector unsigned char Slashes = {
|
||||
'/', '/', '/', '/', '/', '/', '/', '/',
|
||||
'/', '/', '/', '/', '/', '/', '/', '/'
|
||||
};
|
||||
while (CurPtr + 16 < BufferEnd) {
|
||||
if (LLVM_UNLIKELY(
|
||||
vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
|
||||
goto MultiByteUTF8;
|
||||
if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
|
||||
C = *CurPtr++;
|
||||
break;
|
||||
}
|
||||
while (CurPtr + 16 <= BufferEnd &&
|
||||
!vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
|
||||
CurPtr += 16;
|
||||
}
|
||||
|
||||
#else
|
||||
while (CurPtr + 16 <= BufferEnd) {
|
||||
bool HasNonASCII = false;
|
||||
for (unsigned I = 0; I < 16; ++I) {
|
||||
HasNonASCII |= !isASCII(CurPtr[I]);
|
||||
}
|
||||
|
||||
if (LLVM_UNLIKELY(HasNonASCII))
|
||||
goto MultiByteUTF8;
|
||||
|
||||
bool HasSlash = false;
|
||||
for (unsigned I = 0; I < 16; ++I) {
|
||||
HasSlash |= CurPtr[I] == '/';
|
||||
}
|
||||
if (HasSlash)
|
||||
break;
|
||||
CurPtr += 16;
|
||||
// Scan for '/' quickly. Many block comments are very large.
|
||||
while (CurPtr[0] != '/' &&
|
||||
CurPtr[1] != '/' &&
|
||||
CurPtr[2] != '/' &&
|
||||
CurPtr[3] != '/' &&
|
||||
CurPtr+4 < BufferEnd) {
|
||||
CurPtr += 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -2775,28 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
|||
C = *CurPtr++;
|
||||
}
|
||||
|
||||
// Loop to scan the remainder, warning on invalid UTF-8
|
||||
// if the corresponding warning is enabled, emitting a diagnostic only once
|
||||
// per sequence that cannot be decoded.
|
||||
while (C != '/' && C != '\0') {
|
||||
if (isASCII(C)) {
|
||||
UnicodeDecodingAlreadyDiagnosed = false;
|
||||
C = *CurPtr++;
|
||||
continue;
|
||||
}
|
||||
MultiByteUTF8:
|
||||
unsigned Length = llvm::getUTF8SequenceSize(
|
||||
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
|
||||
if (Length == 0) {
|
||||
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
|
||||
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
|
||||
UnicodeDecodingAlreadyDiagnosed = true;
|
||||
C = *CurPtr++;
|
||||
continue;
|
||||
}
|
||||
UnicodeDecodingAlreadyDiagnosed = false;
|
||||
C = *(CurPtr += Length - 1);
|
||||
}
|
||||
// Loop to scan the remainder.
|
||||
while (C != '/' && C != '\0')
|
||||
C = *CurPtr++;
|
||||
|
||||
if (C == '/') {
|
||||
FoundSlash:
|
||||
|
|
|
@ -1,27 +0,0 @@
|
|||
// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
|
||||
// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
|
||||
// nowarn-no-diagnostics
|
||||
|
||||
// This file is purposefully encoded as windows-1252
|
||||
// be careful when modifying.
|
||||
|
||||
//€
|
||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
||||
|
||||
// € ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž
|
||||
// expected-warning@-1 6{{invalid UTF-8 in comment}}
|
||||
|
||||
/*€*/
|
||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
||||
|
||||
/*€ ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž*/
|
||||
// expected-warning@-1 6{{invalid UTF-8 in comment}}
|
||||
|
||||
/*
|
||||
€
|
||||
*/
|
||||
// expected-warning@-2 {{invalid UTF-8 in comment}}
|
||||
|
||||
// abcd
|
||||
// €abcd
|
||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
|
@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
|
|||
|
||||
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
|
||||
|
||||
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
|
||||
|
||||
unsigned getNumBytesForUTF8(UTF8 firstByte);
|
||||
|
||||
/*************************************************************************/
|
||||
|
|
|
@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
|
|||
return isLegalUTF8(source, length);
|
||||
}
|
||||
|
||||
/*
|
||||
* Exported function to return the size of the first utf-8 code unit sequence,
|
||||
* Or 0 if the sequence is not valid;
|
||||
*/
|
||||
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
|
||||
int length = trailingBytesForUTF8[*source] + 1;
|
||||
return (length > sourceEnd - source && isLegalUTF8(source, length)) ? length
|
||||
: 0;
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
static unsigned
|
||||
|
|
Loading…
Reference in New Issue