From 7b753d21b54d042cae9ee97b4d75c18babfbfd76 Mon Sep 17 00:00:00 2001
From: Steve Naroff <snaroff@apple.com>
Date: Mon, 30 Mar 2009 23:46:03 +0000
Subject: [PATCH] Implement UCN support for C string literals (C99 6.4.3) and
 add some very basic tests. Chris Goller has graciously offered to write some
 tests to help validate UCN support.

From a front-end perspective, I believe this code should also work for ObjC @-strings. At the moment, however, code generation for @-strings does not appear to handle UCNs and will need to be tweaked. Will be investigating.

llvm-svn: 68076
---
 .../include/clang/Basic/DiagnosticLexKinds.td |   4 +
 clang/lib/Lex/LiteralSupport.cpp              | 115 +++++++++++++++---
 clang/test/Sema/ucn-cstring.c                 |  15 +++
 3 files changed, 120 insertions(+), 14 deletions(-)
 create mode 100644 clang/test/Sema/ucn-cstring.c

diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 98f1be2fe419..82ebdaddc172 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -52,6 +52,10 @@ def ext_nonstandard_escape : Extension<
   "use of non-standard escape character '\\%0'">;
 def ext_unknown_escape : Extension<"unknown escape sequence '\\%0'">;
 def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">;
+def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">;
+def err_ucn_escape_invalid : Error<"invalid universal character">;
+def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+def err_ucn_escape_too_big : Error<"universal character name is too long">;
 def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
 def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
 def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index c20383f03133..dcd239d5abd4 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -71,8 +71,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
   case 'v':
     ResultChar = 11;
     break;
-    
-    //case 'u': case 'U':  // FIXME: UCNs.
   case 'x': { // Hex escape.
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
@@ -151,7 +149,90 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
   return ResultChar;
 }
 
+/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
+/// When we decide to implement UCN's for character constants and identifiers,
+/// we will likely rework our support for UCN's.
+///
+/// On entry ThisTokBuf points at the backslash of a '\u' (4 hex digits) or
+/// '\U' (8 hex digits) escape. On success the escape is consumed and its
+/// UTF8 bytes are stored at ResultBuf, which is advanced past them. On error
+/// a diagnostic is issued at Loc, HadError is set, and ResultBuf is untouched.
+static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             char *&ResultBuf, const char *ResultBufEnd,
+                             bool &HadError,
+                             SourceLocation Loc, Preprocessor &PP) {
+  // FIXME: Add a warning - UCN's are only valid in C++ & C99.
+  // Skip the '\u' char's.
+  ThisTokBuf += 2;
 
+  if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
+    PP.Diag(Loc, diag::err_ucn_escape_no_digits);
+    HadError = true;
+    return;
+  }
+  typedef unsigned int UTF32;
+  UTF32 UcnVal = 0;
+  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); // \u: 4, \U: 8
+  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+    int CharVal = HexDigitValue(ThisTokBuf[0]);
+    if (CharVal == -1) break;
+    UcnVal <<= 4;
+    UcnVal |= CharVal;
+  }
+  // If we didn't consume the proper number of digits, there is a problem.
+  if (UcnLen) {
+    PP.Diag(Loc, diag::err_ucn_escape_incomplete);
+    HadError = true;
+    return;
+  }
+  // Check UCN constraints (C99 6.4.3p2). Values above 0x10FFFF are rejected
+  // too: they are not Unicode code points and do not fit in 4 UTF8 bytes.
+  if ((UcnVal < 0xa0 &&
+      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
+      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF) || UcnVal > 0x10FFFF) {
+    PP.Diag(Loc, diag::err_ucn_escape_invalid);
+    HadError = true;
+    return;
+  }
+  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
+  // The conversion below was inspired by:
+  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+  // First, we determine how many bytes the result will require.
+  typedef unsigned char UTF8;
+  unsigned short bytesToWrite = 0;
+  if (UcnVal < (UTF32)0x80)
+    bytesToWrite = 1;
+  else if (UcnVal < (UTF32)0x800)
+    bytesToWrite = 2;
+  else if (UcnVal < (UTF32)0x10000)
+    bytesToWrite = 3;
+  else
+    bytesToWrite = 4;
+  // If the buffer isn't big enough, bail.
+  if ((ResultBuf + bytesToWrite) >= ResultBufEnd) {
+    PP.Diag(Loc, diag::err_ucn_escape_too_big);
+    HadError = true;
+    return;
+  }
+  const unsigned byteMask = 0xBF;
+  const unsigned byteMark = 0x80;
+  // Mask OR-ed into the first byte of a UTF8 sequence, indexed by the total
+  // number of bytes in that sequence (only entries 1 through 4 are used here).
+  static const UTF8 firstByteMark[7] = {
+    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+  };
+  // Finally, we write the bytes into ResultBuf, last byte first.
+  ResultBuf += bytesToWrite;
+  switch (bytesToWrite) { // note: everything falls through.
+    case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
+  }
+  // Update the buffer: step past the bytes that were just written.
+  ResultBuf += bytesToWrite;
+}
 
 
 ///       integer-constant: [C99 6.4.4.1]
@@ -757,23 +838,29 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
             *ResultPtr++ = InStart[0];
             // Add zeros at the end.
             for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-            *ResultPtr++ = 0;
+              *ResultPtr++ = 0;
           }
         }
         continue;
       }
       
-      // Otherwise, this is an escape character.  Process it.
-      unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
-                                              StringToks[i].getLocation(),
-                                              ThisIsWide, PP);
-      
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
-      
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
+      if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+        ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, 
+                         GetString() + ResultBuf.size(),
+                         hadError, StringToks[i].getLocation(), PP);
+      } else {
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                                                StringToks[i].getLocation(),
+                                                ThisIsWide, PP);
+        
+        // Note: our internal rep of wide char tokens is always little-endian.
+        *ResultPtr++ = ResultChar & 0xFF;
+        
+        if (AnyWide) {
+          for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            *ResultPtr++ = ResultChar >> i*8;
+        }
       }
     }
   }
diff --git a/clang/test/Sema/ucn-cstring.c b/clang/test/Sema/ucn-cstring.c
new file mode 100644
index 000000000000..ec760f4180e7
--- /dev/null
+++ b/clang/test/Sema/ucn-cstring.c
@@ -0,0 +1,15 @@
+// RUN: clang-cc %s -verify -fsyntax-only -pedantic
+
+#include <stdio.h>
+
+int main(void) {
+  printf("%s (%zu)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world"));
+  printf("%s (%zu)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B"));
+  // Some error conditions: each malformed UCN below must be diagnosed.
+  printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}}
+  printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\u0001"); // expected-error{{invalid universal character}}
+  return 0;
+}
+