forked from OSchip/llvm-project
Revert "[Clang] Add a warning on invalid UTF-8 in comments."
Reverting while I investigate build failures.
This reverts commit e3dc56805f.
This commit is contained in:
parent
23c2bedfd9
commit
fb06dd3e8c
|
@ -279,8 +279,6 @@ Improvements to Clang's diagnostics
|
||||||
unevaluated operands of a ``typeid`` expression, as they are now
|
unevaluated operands of a ``typeid`` expression, as they are now
|
||||||
modeled correctly in the CFG. This fixes
|
modeled correctly in the CFG. This fixes
|
||||||
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
|
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
|
||||||
- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
|
|
||||||
comments.
|
|
||||||
|
|
||||||
Non-comprehensive list of changes in this release
|
Non-comprehensive list of changes in this release
|
||||||
-------------------------------------------------
|
-------------------------------------------------
|
||||||
|
@ -578,7 +576,7 @@ AST Matchers
|
||||||
|
|
||||||
- Added ``forEachTemplateArgument`` matcher which creates a match every
|
- Added ``forEachTemplateArgument`` matcher which creates a match every
|
||||||
time a ``templateArgument`` matches the matcher supplied to it.
|
time a ``templateArgument`` matches the matcher supplied to it.
|
||||||
|
|
||||||
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
|
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
|
||||||
literal expressions.
|
literal expressions.
|
||||||
|
|
||||||
|
|
|
@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
|
||||||
// Unicode and UCNs
|
// Unicode and UCNs
|
||||||
def err_invalid_utf8 : Error<
|
def err_invalid_utf8 : Error<
|
||||||
"source file is not valid UTF-8">;
|
"source file is not valid UTF-8">;
|
||||||
def warn_invalid_utf8_in_comment : Extension<
|
|
||||||
"invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
|
|
||||||
def err_character_not_allowed : Error<
|
def err_character_not_allowed : Error<
|
||||||
"unexpected character <U+%0>">;
|
"unexpected character <U+%0>">;
|
||||||
def err_character_not_allowed_identifier : Error<
|
def err_character_not_allowed_identifier : Error<
|
||||||
|
|
|
@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
|
||||||
//
|
//
|
||||||
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
|
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
|
||||||
// character that ends the line comment.
|
// character that ends the line comment.
|
||||||
|
|
||||||
// C++23 [lex.phases] p1
|
|
||||||
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
|
|
||||||
// diagnostic only once per entire ill-formed subsequence to avoid
|
|
||||||
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
|
|
||||||
bool UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
|
|
||||||
char C;
|
char C;
|
||||||
while (true) {
|
while (true) {
|
||||||
C = *CurPtr;
|
C = *CurPtr;
|
||||||
// Skip over characters in the fast loop.
|
// Skip over characters in the fast loop.
|
||||||
while (isASCII(C) && C != 0 && // Potentially EOF.
|
while (C != 0 && // Potentially EOF.
|
||||||
C != '\n' && C != '\r') { // Newline or DOS-style newline.
|
C != '\n' && C != '\r') // Newline or DOS-style newline.
|
||||||
C = *++CurPtr;
|
C = *++CurPtr;
|
||||||
UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isASCII(C)) {
|
|
||||||
unsigned Length = llvm::getUTF8SequenceSize(
|
|
||||||
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
|
|
||||||
if (Length == 0) {
|
|
||||||
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
|
|
||||||
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
|
|
||||||
UnicodeDecodingAlreadyDiagnosed = true;
|
|
||||||
++CurPtr;
|
|
||||||
} else {
|
|
||||||
UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
CurPtr += Length;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *NextLine = CurPtr;
|
const char *NextLine = CurPtr;
|
||||||
if (C != 0) {
|
if (C != 0) {
|
||||||
|
@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
||||||
if (C == '/')
|
if (C == '/')
|
||||||
C = *CurPtr++;
|
C = *CurPtr++;
|
||||||
|
|
||||||
// C++23 [lex.phases] p1
|
|
||||||
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
|
|
||||||
// diagnostic only once per entire ill-formed subsequence to avoid
|
|
||||||
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
|
|
||||||
bool UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Skip over all non-interesting characters until we find end of buffer or a
|
// Skip over all non-interesting characters until we find end of buffer or a
|
||||||
// (probably ending) '/' character.
|
// (probably ending) '/' character.
|
||||||
|
@ -2703,24 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
||||||
// doesn't check for '\0'.
|
// doesn't check for '\0'.
|
||||||
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
|
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
|
||||||
// While not aligned to a 16-byte boundary.
|
// While not aligned to a 16-byte boundary.
|
||||||
while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
|
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
|
||||||
if (!isASCII(C)) {
|
|
||||||
CurPtr--;
|
|
||||||
goto MultiByteUTF8;
|
|
||||||
}
|
|
||||||
C = *CurPtr++;
|
C = *CurPtr++;
|
||||||
}
|
|
||||||
if (C == '/') goto FoundSlash;
|
if (C == '/') goto FoundSlash;
|
||||||
|
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
__m128i Slashes = _mm_set1_epi8('/');
|
__m128i Slashes = _mm_set1_epi8('/');
|
||||||
while (CurPtr + 16 < BufferEnd) {
|
while (CurPtr+16 <= BufferEnd) {
|
||||||
int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
|
|
||||||
if (LLVM_UNLIKELY(Mask != 0)) {
|
|
||||||
CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
|
|
||||||
goto MultiByteUTF8;
|
|
||||||
}
|
|
||||||
// look for slashes
|
|
||||||
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
|
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
|
||||||
Slashes));
|
Slashes));
|
||||||
if (cmp != 0) {
|
if (cmp != 0) {
|
||||||
|
@ -2733,41 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
||||||
CurPtr += 16;
|
CurPtr += 16;
|
||||||
}
|
}
|
||||||
#elif __ALTIVEC__
|
#elif __ALTIVEC__
|
||||||
__vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
||||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
||||||
0x80, 0x80, 0x80, 0x80};
|
|
||||||
__vector unsigned char Slashes = {
|
__vector unsigned char Slashes = {
|
||||||
'/', '/', '/', '/', '/', '/', '/', '/',
|
'/', '/', '/', '/', '/', '/', '/', '/',
|
||||||
'/', '/', '/', '/', '/', '/', '/', '/'
|
'/', '/', '/', '/', '/', '/', '/', '/'
|
||||||
};
|
};
|
||||||
while (CurPtr + 16 < BufferEnd) {
|
while (CurPtr + 16 <= BufferEnd &&
|
||||||
if (LLVM_UNLIKELY(
|
!vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
|
||||||
vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
|
|
||||||
goto MultiByteUTF8;
|
|
||||||
if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
|
|
||||||
C = *CurPtr++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
CurPtr += 16;
|
CurPtr += 16;
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
while (CurPtr + 16 <= BufferEnd) {
|
// Scan for '/' quickly. Many block comments are very large.
|
||||||
bool HasNonASCII = false;
|
while (CurPtr[0] != '/' &&
|
||||||
for (unsigned I = 0; I < 16; ++I) {
|
CurPtr[1] != '/' &&
|
||||||
HasNonASCII |= !isASCII(CurPtr[I]);
|
CurPtr[2] != '/' &&
|
||||||
}
|
CurPtr[3] != '/' &&
|
||||||
|
CurPtr+4 < BufferEnd) {
|
||||||
if (LLVM_UNLIKELY(HasNonASCII))
|
CurPtr += 4;
|
||||||
goto MultiByteUTF8;
|
|
||||||
|
|
||||||
bool HasSlash = false;
|
|
||||||
for (unsigned I = 0; I < 16; ++I) {
|
|
||||||
HasSlash |= CurPtr[I] == '/';
|
|
||||||
}
|
|
||||||
if (HasSlash)
|
|
||||||
break;
|
|
||||||
CurPtr += 16;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2775,28 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
|
||||||
C = *CurPtr++;
|
C = *CurPtr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop to scan the remainder, warning on invalid UTF-8
|
// Loop to scan the remainder.
|
||||||
// if the corresponding warning is enabled, emitting a diagnostic only once
|
while (C != '/' && C != '\0')
|
||||||
// per sequence that cannot be decoded.
|
C = *CurPtr++;
|
||||||
while (C != '/' && C != '\0') {
|
|
||||||
if (isASCII(C)) {
|
|
||||||
UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
C = *CurPtr++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
MultiByteUTF8:
|
|
||||||
unsigned Length = llvm::getUTF8SequenceSize(
|
|
||||||
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
|
|
||||||
if (Length == 0) {
|
|
||||||
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
|
|
||||||
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
|
|
||||||
UnicodeDecodingAlreadyDiagnosed = true;
|
|
||||||
C = *CurPtr++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
UnicodeDecodingAlreadyDiagnosed = false;
|
|
||||||
C = *(CurPtr += Length - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (C == '/') {
|
if (C == '/') {
|
||||||
FoundSlash:
|
FoundSlash:
|
||||||
|
|
|
@ -1,27 +0,0 @@
|
||||||
// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
|
|
||||||
// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
|
|
||||||
// nowarn-no-diagnostics
|
|
||||||
|
|
||||||
// This file is purposefully encoded as windows-1252
|
|
||||||
// be careful when modifying.
|
|
||||||
|
|
||||||
//€
|
|
||||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
|
||||||
|
|
||||||
// € ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž
|
|
||||||
// expected-warning@-1 6{{invalid UTF-8 in comment}}
|
|
||||||
|
|
||||||
/*€*/
|
|
||||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
|
||||||
|
|
||||||
/*€ ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž*/
|
|
||||||
// expected-warning@-1 6{{invalid UTF-8 in comment}}
|
|
||||||
|
|
||||||
/*
|
|
||||||
€
|
|
||||||
*/
|
|
||||||
// expected-warning@-2 {{invalid UTF-8 in comment}}
|
|
||||||
|
|
||||||
// abcd
|
|
||||||
// €abcd
|
|
||||||
// expected-warning@-1 {{invalid UTF-8 in comment}}
|
|
|
@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
|
||||||
|
|
||||||
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
|
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
|
||||||
|
|
||||||
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
|
|
||||||
|
|
||||||
unsigned getNumBytesForUTF8(UTF8 firstByte);
|
unsigned getNumBytesForUTF8(UTF8 firstByte);
|
||||||
|
|
||||||
/*************************************************************************/
|
/*************************************************************************/
|
||||||
|
|
|
@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
|
||||||
return isLegalUTF8(source, length);
|
return isLegalUTF8(source, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Exported function to return the size of the first utf-8 code unit sequence,
|
|
||||||
* or 0 if the sequence is not valid.
|
|
||||||
*/
|
|
||||||
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
|
|
||||||
int length = trailingBytesForUTF8[*source] + 1;
|
|
||||||
return (length > sourceEnd - source && isLegalUTF8(source, length)) ? length
|
|
||||||
: 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* --------------------------------------------------------------------- */
|
/* --------------------------------------------------------------------- */
|
||||||
|
|
||||||
static unsigned
|
static unsigned
|
||||||
|
|
Loading…
Reference in New Issue