Implement delimited escape sequences.

\x{XXXX}, \u{XXXX}, and \o{OOOO} are accepted in all language modes
in character and string literals.

This is a feature proposed for both C++ (P2290R1) and C (N2785). The
papers have been seen by both committees but are not yet adopted into
either standard. However, they do have support from both committees.
This commit is contained in:
Corentin Jabot 2021-09-15 09:52:25 -04:00 committed by Aaron Ballman
parent bbca392a7f
commit 274adcb866
7 changed files with 348 additions and 42 deletions

View File

@ -127,6 +127,15 @@ def warn_utf8_symbol_zero_width : Warning<
"identifier contains Unicode character <U+%0> that is invisible in "
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
// Diagnostics for delimited escape sequences: \x{...}, \u{...} and \o{...}.

// Reported for a well-formed delimited escape to flag it as an extension.
def ext_delimited_escape_sequence : Extension<
"delimited escape sequences are a Clang extension">,
InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
// The braces of a delimited escape enclose no digits at all.
def err_delimited_escape_empty : Error<
"delimited escape sequence cannot be empty">;
// %0 is the escape letter written after the backslash; the emitter must
// stream it, otherwise the placeholder is left unfilled.
def err_delimited_escape_missing_brace: Error<
"expected '{' after '\\%0' escape sequence">;
// %0 is the offending character found between the braces.
def err_delimited_escape_invalid : Error<
"invalid digit '%0' in escape sequence">;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
def warn_ucn_escape_no_digits : Warning<
@ -134,6 +143,12 @@ def warn_ucn_escape_no_digits : Warning<
"treating as '\\' followed by identifier">, InGroup<Unicode>;
def err_ucn_escape_incomplete : Error<
"incomplete universal character name">;
// Lexer warnings for a \u{...} spelling that cannot be read as a universal
// character name outside of a literal. In both cases the text is not
// combined into a code point; the message states how it is treated instead.

// A character that is not a hex digit appeared before the closing brace.
def warn_delimited_ucn_incomplete : Warning<
"incomplete delimited universal character name; "
"treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
// The closing brace was found but no hex digit preceded it.
def warn_delimited_ucn_empty : Warning<
"empty delimited universal character name; "
"treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
def warn_ucn_escape_incomplete : Warning<
"incomplete universal character name; "
"treating as '\\' followed by identifier">, InGroup<Unicode>;

View File

@ -3112,6 +3112,10 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Token *Result) {
unsigned CharSize;
char Kind = getCharAndSize(StartPtr, CharSize);
bool Delimited = false;
bool FoundEndDelimiter = false;
unsigned Count = 0;
bool Diagnose = Result && !isLexingRawMode();
unsigned NumHexDigits;
if (Kind == 'u')
@ -3122,7 +3126,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
return 0;
if (!LangOpts.CPlusPlus && !LangOpts.C99) {
if (Result && !isLexingRawMode())
if (Diagnose)
Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
return 0;
}
@ -3131,39 +3135,70 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
const char *KindLoc = &CurPtr[-1];
uint32_t CodePoint = 0;
for (unsigned i = 0; i < NumHexDigits; ++i) {
while (Count != NumHexDigits || Delimited) {
char C = getCharAndSize(CurPtr, CharSize);
if (!Delimited && C == '{') {
Delimited = true;
CurPtr += CharSize;
continue;
}
if (Delimited && C == '}') {
CurPtr += CharSize;
FoundEndDelimiter = true;
break;
}
unsigned Value = llvm::hexDigitValue(C);
if (Value == -1U) {
if (Result && !isLexingRawMode()) {
if (i == 0) {
Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
} else {
Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
// If the user wrote \U1234, suggest a fixit to \u.
if (i == 4 && NumHexDigits == 8) {
CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
Diag(KindLoc, diag::note_ucn_four_not_eight)
<< FixItHint::CreateReplacement(URange, "u");
}
}
}
if (!Delimited)
break;
if (Diagnose)
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
<< StringRef(&C, 1);
return 0;
}
if (CodePoint & 0xF000'0000) {
if (Diagnose)
Diag(KindLoc, diag::err_escape_too_large) << 0;
return 0;
}
CodePoint <<= 4;
CodePoint += Value;
CodePoint |= Value;
CurPtr += CharSize;
Count++;
}
if (Count == 0) {
if (Diagnose)
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
return 0;
}
if (!Delimited && Count != NumHexDigits) {
if (Diagnose) {
Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
// If the user wrote \U1234, suggest a fixit to \u.
if (Count == 4 && NumHexDigits == 8) {
CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
Diag(KindLoc, diag::note_ucn_four_not_eight)
<< FixItHint::CreateReplacement(URange, "u");
}
}
return 0;
}
if (Delimited && PP) {
Diag(BufferPtr, diag::ext_delimited_escape_sequence);
}
if (Result) {
Result->setFlag(Token::HasUCN);
if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
StartPtr = CurPtr;
else
while (StartPtr != CurPtr)

View File

@ -95,6 +95,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
DiagnosticsEngine *Diags,
const LangOptions &Features) {
const char *EscapeBegin = ThisTokBuf;
bool Delimited = false;
bool EndDelimiterFound = false;
// Skip the '\' char.
++ThisTokBuf;
@ -143,26 +145,47 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
break;
case 'x': { // Hex escape.
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
ThisTokBuf++;
if (*ThisTokBuf == '}') {
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_delimited_escape_empty);
return ResultChar;
}
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_hex_escape_no_digits) << "x";
HadError = true;
break;
return ResultChar;
}
// Hex escapes are a maximal series of hex digits.
bool Overflow = false;
for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
if (CharVal == -1) break;
if (Delimited && *ThisTokBuf == '}') {
ThisTokBuf++;
EndDelimiterFound = true;
break;
}
int CharVal = llvm::hexDigitValue(*ThisTokBuf);
if (CharVal == -1) {
// Non delimited hex escape sequences stop at the first non-hex digit.
if (!Delimited)
break;
HadError = true;
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_delimited_escape_invalid)
<< StringRef(ThisTokBuf, 1);
continue;
}
// About to shift out a digit?
if (ResultChar & 0xF0000000)
Overflow = true;
ResultChar <<= 4;
ResultChar |= CharVal;
}
// See if any bits will be truncated when evaluated as a character.
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
Overflow = true;
@ -170,9 +193,13 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
}
// Check for overflow.
if (Overflow && Diags) // Too many digits to fit in
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_escape_too_large) << 0;
if (!HadError && Overflow) { // Too many digits to fit in
HadError = true;
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_escape_too_large)
<< 0;
}
break;
}
case '0': case '1': case '2': case '3':
@ -200,7 +227,58 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
}
break;
}
case 'o': { // Delimited octal escape: \o{O...}.
  bool Overflow = false;
  // \o is only accepted when immediately followed by '{'.
  if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
    HadError = true;
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_delimited_escape_missing_brace)
          << "o"; // Fills the %0 placeholder of the diagnostic text.
    break;
  }
  ResultChar = 0;
  Delimited = true;
  ++ThisTokBuf;
  // An empty "\o{}" is reported exactly once here. Returning early skips the
  // delimiter checks after the switch so that no second diagnostic is issued.
  // The end-of-token test keeps us from reading past the buffer, and the
  // Diags guard matches every other diagnostic site in this function.
  if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '}') {
    HadError = true;
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_delimited_escape_empty);
    return ResultChar;
  }

  while (ThisTokBuf != ThisTokEnd) {
    if (*ThisTokBuf == '}') {
      EndDelimiterFound = true;
      ++ThisTokBuf;
      break;
    }
    if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
      // Keep scanning after a bad character so that each one is reported and
      // the closing brace can still be located.
      HadError = true;
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
             diag::err_delimited_escape_invalid)
            << StringRef(ThisTokBuf, 1);
      ++ThisTokBuf;
      continue;
    }
    // The shift below discards the top three bits; if any of them is set the
    // value no longer fits in 32 bits. Testing only one of the three would
    // let values such as \o{100000000000} wrap around silently.
    if (ResultChar & 0xE0000000)
      Overflow = true;

    ResultChar <<= 3;
    ResultChar |= *ThisTokBuf++ - '0';
  }

  // Check for overflow. Reject '\o{777}', but not L'\o{777}'.
  if (!HadError &&
      (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
    HadError = true;
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_escape_too_large)
          << 1; // Selects the octal wording; the hex case streams 0.
    // Truncate to the character width so the returned value is in range.
    ResultChar &= ~0U >> (32 - CharWidth);
  }
  break;
}
// Otherwise, these are not valid escapes.
case '(': case '{': case '[': case '%':
// GCC accepts these as extensions. We warn about them as such though.
@ -224,6 +302,17 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
break;
}
// Final checks for delimited escapes, only when diagnostics are enabled:
// a missing closing brace is an error; otherwise, if nothing else went
// wrong, note that the delimited form is an extension.
if (Delimited && Diags) {
if (!EndDelimiterFound)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_expected)
<< tok::r_brace;
else if (!HadError) {
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::ext_delimited_escape_sequence);
}
}
return ResultChar;
}
@ -245,18 +334,32 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
}
++I;
assert(*I == 'u' || *I == 'U');
char Kind = *I;
++I;
assert(Kind == 'u' || Kind == 'U');
uint32_t CodePoint = 0;
if (Kind == 'u' && *I == '{') {
for (++I; *I != '}'; ++I) {
unsigned Value = llvm::hexDigitValue(*I);
assert(Value != -1U);
CodePoint <<= 4;
CodePoint += Value;
}
appendCodePoint(CodePoint, Buf);
continue;
}
unsigned NumHexDigits;
if (*I == 'u')
if (Kind == 'u')
NumHexDigits = 4;
else
NumHexDigits = 8;
assert(I + NumHexDigits <= E);
uint32_t CodePoint = 0;
for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
for (; NumHexDigits != 0; ++I, --NumHexDigits) {
unsigned Value = llvm::hexDigitValue(*I);
assert(Value != -1U);
@ -282,28 +385,82 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
// Skip the '\u' char's.
ThisTokBuf += 2;
if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
bool Delimited = false;
bool EndDelimiterFound = false;
bool HasError = false;
if (UcnBegin[1] == 'u' && in_char_string_literal &&
ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
ThisTokBuf++;
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
return false;
}
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
unsigned short UcnLenSave = UcnLen;
for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
if (CharVal == -1) break;
bool Overflow = false;
unsigned short Count = 0;
for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
++ThisTokBuf) {
if (Delimited && *ThisTokBuf == '}') {
++ThisTokBuf;
EndDelimiterFound = true;
break;
}
int CharVal = llvm::hexDigitValue(*ThisTokBuf);
if (CharVal == -1) {
HasError = true;
if (!Delimited)
break;
if (Diags) {
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::err_delimited_escape_invalid)
<< StringRef(ThisTokBuf, 1);
}
Count++;
continue;
}
if (UcnVal & 0xF0000000) {
Overflow = true;
continue;
}
UcnVal <<= 4;
UcnVal |= CharVal;
Count++;
}
// If we didn't consume the proper number of digits, there is a problem.
if (UcnLenSave) {
if (Overflow) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::err_ucn_escape_incomplete);
diag::err_escape_too_large)
<< 0;
return false;
}
if (Delimited && !EndDelimiterFound) {
if (Diags) {
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::err_expected)
<< tok::r_brace;
}
return false;
}
// If we didn't consume the proper number of digits, there is a problem.
if (Count == 0 || (!Delimited && Count != UcnLen)) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
Delimited ? diag::err_delimited_escape_empty
: diag::err_ucn_escape_incomplete);
return false;
}
if (HasError)
return false;
// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
UcnVal > 0x10FFFF) { // maximum legal UTF32 value
@ -338,6 +495,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::warn_ucn_not_valid_in_c89_literal);
if (Delimited && Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::ext_delimited_escape_sequence);
return true;
}

View File

@ -0,0 +1,81 @@
// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
// Malformed delimited escapes inside string literals. For each of the
// u, x and o forms: empty braces, a missing closing brace, and a character
// that is not a digit of the escape's base.
const char *errors =
"\u{}" //expected-error {{delimited escape sequence cannot be empty}}
"\u{" //expected-error {{expected '}'}}
"\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
"\x{}" //expected-error {{delimited escape sequence cannot be empty}}
"\x{" //expected-error {{expected '}'}}
"\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
"\o{}" //expected-error {{delimited escape sequence cannot be empty}}
"\o{" //expected-error {{expected '}'}}
"\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
;
// Delimited universal character names in character literals: a code point
// too large for plain char, accepted spellings (including one padded with
// leading zeros), a control character that is rejected in C but not in C++,
// and a value with too many significant digits.
void ucn() {
char a = '\u{1234}'; // expected-error {{character too large for enclosing character literal type}}
// expected-warning@-1 {{delimited escape sequences are a Clang extension}}
unsigned b = U'\u{1234}'; // expected-warning {{extension}}
#ifdef __cplusplus
unsigned b2 = U'\u{1}'; // expected-warning {{extension}}
#else
unsigned b2 = U'\u{1}'; //expected-error {{universal character name refers to a control character}}
#endif
unsigned c = U'\u{000000000001234}'; // expected-warning {{extension}}
unsigned d = U'\u{111111111}'; //expected-error {{hex escape sequence out of range}}
}
// Delimited hex escapes: a valid value, non-hex characters between the
// braces (one diagnostic per bad character), and range checks against the
// width of char, wchar_t and U'' literals. The wchar_t cases are split on
// __WCHAR_MAX__ because some RUN lines force a 16-bit wchar_t.
void hex() {
char a = '\x{1}'; // expected-warning {{extension}}
char b = '\x{abcdegggggabc}'; // expected-error 5{{invalid digit 'g' in escape sequence}}
char c = '\x{ff1}'; // expected-error {{hex escape sequence out of range}}
#if __WCHAR_MAX__ > 0xFFFF
unsigned d = L'\x{FFFFFFFF}'; // expected-warning {{extension}}
unsigned e = L'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
#else
unsigned f = L'\x{FFFF}'; // expected-warning {{extension}}
unsigned g = L'\x{10000}'; // expected-error {{hex escape sequence out of range}}
#endif
unsigned h = U'\x{FFFFFFFF}'; // expected-warning {{extension}}
unsigned i = U'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
}
// Delimited octal escapes: a valid value, characters outside 0-7 between
// the braces (one diagnostic per bad character), and range checks against
// the width of char and wchar_t. The wchar_t cases are split on
// __WCHAR_MAX__ because some RUN lines force a 16-bit wchar_t.
void octal() {
char a = '\o{1}'; // expected-warning {{extension}}
char b = '\o{12345678881238}'; // expected-error 4{{invalid digit '8' in escape sequence}}
char c = '\o{777}'; // //expected-error {{octal escape sequence out of range}}
#if __WCHAR_MAX__ > 0xFFFF
unsigned d = L'\o{37777777777}'; // expected-warning {{extension}}
unsigned e = L'\o{40000000000}'; // expected-error {{octal escape sequence out of range}}
#else
unsigned d = L'\o{177777}'; // expected-warning {{extension}}
unsigned e = L'\o{200000}'; // expected-error {{octal escape sequence out of range}}
#endif
}
// A delimited escape cannot span adjacent string literals: the closing
// brace must appear in the same literal piece as the opening one, whether
// the split falls before or after the digits.
void concat() {
(void)"\x{" "12}"; // expected-error {{expected '}'}}
(void)"\u{" "12}"; // expected-error {{expected '}'}}
(void)"\o{" "12}"; // expected-error {{expected '}'}}
(void)"\x{12" "}"; // expected-error {{expected '}'}}
(void)"\u{12" "}"; // expected-error {{expected '}'}}
(void)"\o{12" "}"; // expected-error {{expected '}'}}
}
// Digit separators are not accepted inside a delimited escape. In a string
// literal the quote is reported as an invalid digit; in the character
// literal case the diagnostics below indicate that the quote ends the
// literal early, leaving the escape without its closing brace.
void separators() {
(void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
(void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
(void)"\o{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
'\x{12'3'}'; // expected-error {{expected '}'}}
// expected-error@-1 2{{expected ';'}}
// expected-warning@-2 3{{expression result unused}}
}

View File

@ -129,6 +129,9 @@ int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator
int operator""_""_\u212e""_\U0000212e""(const char*, size_t);
int operator""_\u212e""_\U0000212e""_""(const char*, size_t);
int operator""_\U0000212e""_""_\u212e""(const char*, size_t);
int operator""_\u{212f}(char);
int mix_ucn_utf8 = ""_""_\u212e""_\U0000212e"";
void operator""_""_(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_' and '_') in string literal concatenation}}

View File

@ -16,6 +16,10 @@
#error "This should never happen"
#endif
#if a\u{FD}() //expected-warning {{Clang extension}}
#error "This should never happen"
#endif
#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
#endif
#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
@ -27,6 +31,7 @@
#define \ufffe // expected-error {{macro name must be an identifier}}
#define \U10000000 // expected-error {{macro name must be an identifier}}
#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
#define a\u0024
@ -103,3 +108,8 @@ C 1
// CHECK-NEXT: #define capital_u_\U00FC
// CHECK-NEXT: {{^ \^}}
// CHECK-NEXT: {{^ u}}
#define \u{} // expected-warning {{empty delimited universal character name; treating as '\' 'u' '{' '}'}} expected-error {{macro name must be an identifier}}
#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}

View File

@ -17,6 +17,7 @@ void goodCalls() {
\u00fcber(1);
über(2);
\U000000FCber(3);
\u{FC}ber(4); // expected-warning {{Clang extension}}
}
void badCalls() {
@ -24,7 +25,7 @@ void badCalls() {
\u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}}
über(1, 2);
\U000000FCber();
\U000000FCber();
#ifdef __cplusplus
// expected-error@-3 {{no matching function}}
// expected-error@-3 {{no matching function}}