From 7b753d21b54d042cae9ee97b4d75c18babfbfd76 Mon Sep 17 00:00:00 2001 From: Steve Naroff Date: Mon, 30 Mar 2009 23:46:03 +0000 Subject: [PATCH] Implement UCN support for C string literals (C99 6.4.3) and add some very basic tests. Chris Goller has graciously offered to write some tests to help validate UCN support. From a front-end perspective, I believe this code should work for ObjC @-strings. At the moment, I believe we need to tweak the code generation for @-strings (which doesn't appear to handle them). Will be investigating. llvm-svn: 68076 --- .../include/clang/Basic/DiagnosticLexKinds.td | 4 + clang/lib/Lex/LiteralSupport.cpp | 115 +++++++++++++++--- clang/test/Sema/ucn-cstring.c | 15 +++ 3 files changed, 120 insertions(+), 14 deletions(-) create mode 100644 clang/test/Sema/ucn-cstring.c diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 98f1be2fe419..82ebdaddc172 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -52,6 +52,10 @@ def ext_nonstandard_escape : Extension< "use of non-standard escape character '\\%0'">; def ext_unknown_escape : Extension<"unknown escape sequence '\\%0'">; def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">; +def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">; +def err_ucn_escape_invalid : Error<"invalid universal character">; +def err_ucn_escape_incomplete : Error<"incomplete universal character name">; +def err_ucn_escape_too_big : Error<"universal character name is too long">; def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">; def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">; def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">; diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index c20383f03133..dcd239d5abd4 100644 --- 
a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -71,8 +71,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, case 'v': ResultChar = 11; break; - - //case 'u': case 'U': // FIXME: UCNs. case 'x': { // Hex escape. ResultChar = 0; if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { @@ -151,7 +149,90 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, return ResultChar; } +/// ProcessUCNEscape - Read the Universal Character Name, check constraints and +/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser. +/// When we decide to implement UCN's for character constants and identifiers, +/// we will likely rework our support for UCN's. +static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, + char *&ResultBuf, const char *ResultBufEnd, + bool &HadError, + SourceLocation Loc, Preprocessor &PP) { + // FIXME: Add a warning - UCN's are only valid in C++ & C99. + + // Skip the '\u' char's. + ThisTokBuf += 2; + if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { + PP.Diag(Loc, diag::err_ucn_escape_no_digits); + HadError = 1; + return; + } + typedef unsigned int UTF32; + + UTF32 UcnVal = 0; + unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); + for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) { + int CharVal = HexDigitValue(ThisTokBuf[0]); + if (CharVal == -1) break; + UcnVal <<= 4; + UcnVal |= CharVal; + } + // If we didn't consume the proper number of digits, there is a problem. + if (UcnLen) { + PP.Diag(Loc, diag::err_ucn_escape_incomplete); + HadError = 1; + return; + } + // Check UCN constraints (C99 6.4.3p2) + if ((UcnVal < 0xa0 && + (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, ` + || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)) { + PP.Diag(Loc, diag::err_ucn_escape_invalid); + HadError = 1; + return; + } + // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 
+ // The conversion below was inspired by: + // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c + // First, we determine how many bytes the result will require. + typedef unsigned char UTF8; + + unsigned short bytesToWrite = 0; + if (UcnVal < (UTF32)0x80) + bytesToWrite = 1; + else if (UcnVal < (UTF32)0x800) + bytesToWrite = 2; + else if (UcnVal < (UTF32)0x10000) + bytesToWrite = 3; + else + bytesToWrite = 4; + + // If the buffer isn't big enough, bail. + if ((ResultBuf + bytesToWrite) >= ResultBufEnd) { + PP.Diag(Loc, diag::err_ucn_escape_too_big); + HadError = 1; + return; + } + const unsigned byteMask = 0xBF; + const unsigned byteMark = 0x80; + + // Once the bits are split out into bytes of UTF8, this is a mask OR-ed + // into the first byte, depending on how many bytes follow. There are + // as many entries in this table as there are UTF8 sequence types. + static const UTF8 firstByteMark[7] = { + 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC + }; + // Finally, we write the bytes into ResultBuf. + ResultBuf += bytesToWrite; + switch (bytesToWrite) { // note: everything falls through. + case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); + } + // Update the buffer. + ResultBuf += bytesToWrite; +} /// integer-constant: [C99 6.4.4.1] @@ -757,23 +838,29 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks, *ResultPtr++ = InStart[0]; // Add zeros at the end. for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) - *ResultPtr++ = 0; + *ResultPtr++ = 0; } } continue; } - // Otherwise, this is an escape character. Process it. 
- unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, - StringToks[i].getLocation(), - ThisIsWide, PP); - - // Note: our internal rep of wide char tokens is always little-endian. - *ResultPtr++ = ResultChar & 0xFF; - - if (AnyWide) { - for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) - *ResultPtr++ = ResultChar >> i*8; + if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { + ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, + GetString() + ResultBuf.size(), + hadError, StringToks[i].getLocation(), PP); + } else { + // Otherwise, this is a non-UCN escape character. Process it. + unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, + StringToks[i].getLocation(), + ThisIsWide, PP); + + // Note: our internal rep of wide char tokens is always little-endian. + *ResultPtr++ = ResultChar & 0xFF; + + if (AnyWide) { + for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) + *ResultPtr++ = ResultChar >> i*8; + } } } } diff --git a/clang/test/Sema/ucn-cstring.c b/clang/test/Sema/ucn-cstring.c new file mode 100644 index 000000000000..ec760f4180e7 --- /dev/null +++ b/clang/test/Sema/ucn-cstring.c @@ -0,0 +1,15 @@ +// RUN: clang-cc %s -verify -fsyntax-only -pedantic + +#include + +int main(void) { + printf("%s (%d)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world")); + printf("%s (%d)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B")); + // Some error conditions... + printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}} + printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}} + printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}} + printf("%s\n", "\u0001"); // expected-error{{invalid universal character}} + return 0; +} +