PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.

llvm-svn: 201532
This commit is contained in:
Richard Smith 2014-02-17 21:52:30 +00:00
parent 6287371ce6
commit 8b7258bdb3
6 changed files with 217 additions and 117 deletions

View File

@ -614,8 +614,28 @@ private:
/// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
/// invalid.
uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
};
/// \brief Try to consume a UCN as part of an identifier at the current
/// location.
/// \param CurPtr Initially points to the range of characters in the source
/// buffer containing the '\'. Updated to point past the end of
/// the UCN on success.
/// \param Size The number of characters occupied by the '\' (including
/// trigraphs and escaped newlines).
/// \param Result The token being produced. Marked as containing a UCN on
/// success.
/// \return \c true if a UCN was lexed and it produced an acceptable
/// identifier character, \c false otherwise.
bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
Token &Result);
/// \brief Try to consume an identifier character encoded in UTF-8.
/// \param CurPtr Points to the start of the (potential) UTF-8 code unit
/// sequence. On success, updated to point past the end of it.
/// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
/// character was lexed, \c false otherwise.
bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
};
} // end namespace clang

View File

@ -33,6 +33,9 @@ class TargetInfo;
class SourceManager;
class LangOptions;
/// Copy characters from Input to Buf, expanding any UCNs.
void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input);
/// NumericLiteralParser - This performs strict semantic analysis of the content
/// of a ppnumber, classifying it as either integer, floating, or erroneous,
/// determines the radix of the value and can convert it to a useful value.
@ -48,6 +51,8 @@ class NumericLiteralParser {
bool saw_exponent, saw_period, saw_ud_suffix;
SmallString<32> UDSuffixBuf;
public:
NumericLiteralParser(StringRef TokSpelling,
SourceLocation TokLoc,
@ -72,7 +77,7 @@ public:
}
StringRef getUDSuffix() const {
assert(saw_ud_suffix);
return StringRef(SuffixBegin, ThisTokEnd - SuffixBegin);
return UDSuffixBuf;
}
unsigned getUDSuffixOffset() const {
assert(saw_ud_suffix);

View File

@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
<< Range;
}
}
}
}
/// Attempt to lex a universal-character-name as one identifier character.
///
/// \param CurPtr Points at the backslash on entry. Moved past the UCN on
///               success; left unchanged on failure.
/// \param Size   Number of buffer characters taken up by the backslash.
/// \param Result Token under construction; flagged Token::HasUCN on success.
/// \return true if a UCN was read and names a code point accepted by
///         isAllowedIDChar, false otherwise.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  // Read the UCN through a scratch pointer so that CurPtr is only committed
  // once the code point has been accepted.
  const char *UCNEnd = CurPtr + Size;
  const uint32_t CodePoint = tryReadUCN(UCNEnd, CurPtr, /*Token=*/0);
  const bool IsIDChar = CodePoint != 0 && isAllowedIDChar(CodePoint, LangOpts);
  if (!IsIDChar)
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNEnd),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);

  // A UCN spelled in exactly 6 ("\uXXXX") or 10 ("\UXXXXXXXX") characters can
  // be skipped in a single step.
  const bool HasPlainSpelling = (UCNEnd - CurPtr == 6 && CurPtr[1] == 'u') ||
                                (UCNEnd - CurPtr == 10 && CurPtr[1] == 'U');
  if (HasPlainSpelling) {
    CurPtr = UCNEnd;
    return true;
  }

  // Any other length is walked one character at a time so the token sees each
  // step. NOTE(review): presumably this covers trigraphs / escaped newlines
  // inside the UCN -- confirm against getAndAdvanceChar.
  while (CurPtr != UCNEnd)
    (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
const char *UnicodePtr = CurPtr;
UTF32 CodePoint;
ConversionResult Result =
llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
(const UTF8 *)BufferEnd,
&CodePoint,
strictConversion);
if (Result != conversionOK ||
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
return false;
if (!isLexingRawMode())
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
CurPtr = UnicodePtr;
return true;
}
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@ -1500,47 +1543,10 @@ FinishIdentifier:
C = getCharAndSize(CurPtr, Size);
continue;
} else if (C == '\\') {
const char *UCNPtr = CurPtr + Size;
uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
goto FinishIdentifier;
if (!isLexingRawMode()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UCNPtr),
/*IsFirst=*/false);
}
Result.setFlag(Token::HasUCN);
if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
(UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
CurPtr = UCNPtr;
else
while (CurPtr != UCNPtr)
(void)getAndAdvanceChar(CurPtr, Result);
} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
C = getCharAndSize(CurPtr, Size);
continue;
} else if (!isASCII(C)) {
const char *UnicodePtr = CurPtr;
UTF32 CodePoint;
ConversionResult Result =
llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
(const UTF8 *)BufferEnd,
&CodePoint,
strictConversion);
if (Result != conversionOK ||
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
goto FinishIdentifier;
if (!isLexingRawMode()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
}
CurPtr = UnicodePtr;
} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
C = getCharAndSize(CurPtr, Size);
continue;
} else if (!isIdentifierBody(C)) {
@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
while (isPreprocessingNumberBody(C)) {
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);
@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
}
}
// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
return LexNumericConstant(Result, CurPtr);
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
return LexNumericConstant(Result, CurPtr);
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
bool IsStringLiteral) {
assert(getLangOpts().CPlusPlus);
// Maximally munch an identifier. FIXME: UCNs.
// Maximally munch an identifier.
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
if (isIdentifierHead(C)) {
if (!getLangOpts().CPlusPlus11) {
if (!isLexingRawMode())
Diag(CurPtr,
C == '_' ? diag::warn_cxx11_compat_user_defined_literal
: diag::warn_cxx11_compat_reserved_user_defined_literal)
<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
return CurPtr;
}
bool Consumed = false;
// C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
// that does not start with an underscore is ill-formed. As a conforming
// extension, we treat all such suffixes as if they had whitespace before
// them.
if (!isIdentifierHead(C)) {
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
Consumed = true;
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
Consumed = true;
else
return CurPtr;
}
if (!getLangOpts().CPlusPlus11) {
if (!isLexingRawMode())
Diag(CurPtr,
C == '_' ? diag::warn_cxx11_compat_user_defined_literal
: diag::warn_cxx11_compat_reserved_user_defined_literal)
<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
return CurPtr;
}
// C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
// that does not start with an underscore is ill-formed. As a conforming
// extension, we treat all such suffixes as if they had whitespace before
// them. We assume a suffix beginning with a UCN or UTF-8 character is more
// likely to be a ud-suffix than a macro, however, and accept that.
if (!Consumed) {
bool IsUDSuffix = false;
if (C == '_')
IsUDSuffix = true;
@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
Diag(CurPtr, getLangOpts().MSVCCompat
? diag::ext_ms_reserved_user_defined_literal
: diag::ext_reserved_user_defined_literal)
<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
return CurPtr;
}
Result.setFlag(Token::HasUDSuffix);
do {
CurPtr = ConsumeChar(CurPtr, Size, Result);
C = getCharAndSize(CurPtr, Size);
} while (isIdentifierBody(C));
CurPtr = ConsumeChar(CurPtr, Size, Result);
}
Result.setFlag(Token::HasUDSuffix);
while (true) {
C = getCharAndSize(CurPtr, Size);
if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
else break;
}
return CurPtr;
}

View File

@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
return ResultChar;
}
/// Encode \p Codepoint with llvm::ConvertCodePointToUTF8 and append the
/// resulting bytes to \p Str. The conversion is expected to succeed; failure
/// is only checked by an assertion.
static void appendCodePoint(unsigned Codepoint,
                            llvm::SmallVectorImpl<char> &Str) {
  char Encoded[4];
  // The converter receives this cursor by reference; the bytes appended are
  // those between the start of the buffer and wherever it leaves the cursor.
  char *EncodedEnd = Encoded;
  const bool Converted = llvm::ConvertCodePointToUTF8(Codepoint, EncodedEnd);
  assert(Converted && "Unexpected conversion failure");
  (void)Converted;
  Str.append(Encoded, EncodedEnd);
}
/// Copy characters from \p Input to \p Buf, replacing every
/// universal-character-name with the UTF-8 encoding of its code point.
///
/// \param Buf   Receives the expanded text; existing contents are kept and
///              the output is appended.
/// \param Input Spelling to expand. Every backslash must introduce a complete
///              "\uXXXX" or "\UXXXXXXXX" sequence of hex digits; this is
///              checked only by assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
    if (*I != '\\') {
      Buf.push_back(*I);
      continue;
    }

    // Step onto the 'u' / 'U' selector, making sure it exists before it is
    // read.
    ++I;
    assert(I != E && (*I == 'u' || *I == 'U') &&
           "backslash must introduce a UCN");

    const unsigned NumHexDigits = (*I == 'u') ? 4 : 8;

    // The digits occupy I+1 .. I+NumHexDigits, so strictly more than
    // NumHexDigits characters must remain starting at I. (A '<=' comparison
    // against E would let a UCN that is one digit short read *E.)
    assert(static_cast<unsigned>(E - I) > NumHexDigits && "truncated UCN");

    uint32_t CodePoint = 0;
    for (unsigned Remaining = NumHexDigits; Remaining != 0; --Remaining) {
      ++I;
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);
      CodePoint = (CodePoint << 4) + Value;
    }

    appendCodePoint(CodePoint, Buf);
    // I now rests on the last hex digit; the loop's ++I moves past the UCN.
  }
}
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
}
if (s != ThisTokEnd) {
if (isValidUDSuffix(PP.getLangOpts(),
StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) {
// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
// Any suffix pieces we might have parsed are actually part of the
// ud-suffix.
isLong = false;
@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
do {
--end;
} while (end[-1] != '\'');
UDSuffixBuf.assign(end, UDSuffixEnd);
// FIXME: Don't bother with this if !tok.hasUCN().
expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
UDSuffixOffset = end - TokBegin;
}
@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
if (UDSuffixBuf.empty()) {
UDSuffixBuf.assign(UDSuffix);
if (StringToks[i].hasUCN())
expandUCNs(UDSuffixBuf, UDSuffix);
else
UDSuffixBuf.assign(UDSuffix);
UDSuffixToken = i;
UDSuffixOffset = ThisTokEnd - ThisTokBuf;
UDSuffixTokLoc = StringToks[i].getLocation();
} else if (!UDSuffixBuf.equals(UDSuffix)) {
} else {
SmallString<32> ExpandedUDSuffix;
if (StringToks[i].hasUCN()) {
expandUCNs(ExpandedUDSuffix, UDSuffix);
UDSuffix = ExpandedUDSuffix;
}
// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
// result of a concatenation involving at least one user-defined-string-
// literal, all the participating user-defined-string-literals shall
// have the same ud-suffix.
if (Diags) {
SourceLocation TokLoc = StringToks[i].getLocation();
Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
<< UDSuffixBuf << UDSuffix
<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
<< SourceRange(TokLoc, TokLoc);
if (!UDSuffixBuf.equals(UDSuffix)) {
if (Diags) {
SourceLocation TokLoc = StringToks[i].getLocation();
Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
<< UDSuffixBuf << UDSuffix
<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
<< SourceRange(TokLoc, TokLoc);
}
hadError = true;
}
hadError = true;
}
}

View File

@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() {
// Lexer Event Handling.
//===----------------------------------------------------------------------===//
/// Encode \p Codepoint with llvm::ConvertCodePointToUTF8 and append the
/// resulting bytes to \p Str. The conversion is expected to succeed; failure
/// is only checked by an assertion.
static void appendCodePoint(unsigned Codepoint,
                            llvm::SmallVectorImpl<char> &Str) {
  char Encoded[4];
  // The converter receives this cursor by reference; the bytes appended are
  // those between the start of the buffer and wherever it leaves the cursor.
  char *EncodedEnd = Encoded;
  const bool Converted = llvm::ConvertCodePointToUTF8(Codepoint, EncodedEnd);
  assert(Converted && "Unexpected conversion failure");
  (void)Converted;
  Str.append(Encoded, EncodedEnd);
}
/// Copy characters from \p Input to \p Buf, replacing every
/// universal-character-name with the UTF-8 encoding of its code point.
///
/// \param Buf   Receives the expanded text; existing contents are kept and
///              the output is appended.
/// \param Input Spelling to expand. Every backslash must introduce a complete
///              "\uXXXX" or "\UXXXXXXXX" sequence of hex digits; this is
///              checked only by assertions.
static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
    if (*I != '\\') {
      Buf.push_back(*I);
      continue;
    }

    // Step onto the 'u' / 'U' selector, making sure it exists before it is
    // read.
    ++I;
    assert(I != E && (*I == 'u' || *I == 'U') &&
           "backslash must introduce a UCN");

    const unsigned NumHexDigits = (*I == 'u') ? 4 : 8;

    // The digits occupy I+1 .. I+NumHexDigits, so strictly more than
    // NumHexDigits characters must remain starting at I. (A '<=' comparison
    // against E would let a UCN that is one digit short read *E.)
    assert(static_cast<unsigned>(E - I) > NumHexDigits && "truncated UCN");

    uint32_t CodePoint = 0;
    for (unsigned Remaining = NumHexDigits; Remaining != 0; --Remaining) {
      ++I;
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);
      CodePoint = (CodePoint << 4) + Value;
    }

    appendCodePoint(CodePoint, Buf);
    // I now rests on the last hex digit; the loop's ++I moves past the UCN.
  }
}
/// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
/// identifier information for the token and install it into the token,
/// updating the token kind accordingly.

View File

@ -111,3 +111,35 @@ void operator "" ""
U"" // expected-error {{cannot have an encoding prefix}}
"" _also_not_char(const char *);
void operator "" u8"" "\u0123" "hello"_all_of_the_things ""(const char*); // expected-error {{must be '""'}}
// Make sure we treat UCNs and UTF-8 as equivalent.
// Each group defines a literal operator with one spelling of the suffix, uses
// the suffix in both spellings, and then defines it again with the other
// spelling; the second definition must be reported as a redefinition of the
// first.
// Integer literal: UTF-8 first, then the \u00b5 spelling.
int operator""_µs(unsigned long long) {} // expected-note {{previous}}
int hundred_µs = 50_µs + 50_\u00b5s;
int operator""_\u00b5s(unsigned long long) {} // expected-error {{redefinition of 'operator "" _µs'}}
// Floating literal: the \U0000212B spelling first, then UTF-8.
int operator""_\U0000212B(long double) {} // expected-note {{previous}}
int hundred_Å = 50.0_Å + 50._\U0000212B;
int operator""_Å(long double) {} // expected-error {{redefinition of 'operator "" _Å'}}
// Character literal: UTF-8 first, then the \U00010000 spelling.
int operator""_𐀀(char) {} // expected-note {{previous}}
int 𐀀 = '4'_𐀀 + '2'_\U00010000;
int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator "" _𐀀'}}
// These all declare the same function.
int operator""_""_\u212e""_\U0000212e""(const char*, size_t);
int operator""_\u212e""_\U0000212e""_""(const char*, size_t);
int operator""_\U0000212e""_""_\u212e""(const char*, size_t);
int mix_ucn_utf8 = ""_""_\u212e""_\U0000212e"";
void operator""_""_(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_' and '_') in string literal concatenation}}
void operator""_""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_' and '_') in string literal concatenation}}
void operator""_\u212e""_(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_' and '_') in string literal concatenation}}
void operator""_\u212e""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_' and '_') in string literal concatenation}}
void operator""_""_(unsigned long long) {} // expected-note {{previous}}
void operator""_\u212e""_\u212e(unsigned long long) {} // expected-error {{redefinition}}
// '¢' is rejected as a macro name and reported as a non-ASCII character
// wherever it appears in a ud-suffix below.
// NOTE(review): presumably because it is not an allowed identifier character,
// so the UTF-8 ud-suffix support does not apply to it -- confirm.
#define ¢ *0.01 // expected-error {{macro names must be identifiers}}
constexpr int operator""_¢(long double d) { return d * 100; } // expected-error {{non-ASCII}}
constexpr int operator""_¢(unsigned long long n) { return n; } // expected-error {{non-ASCII}}
static_assert(0.02_¢ == 2_¢, ""); // expected-error 2{{non-ASCII}}