Pull Lexer's CharInfo table out for general use throughout Clang.

Rewriting the same predicates over and over again is bad for code size and code maintenance. Using the functions in <ctype.h> is generally unsafe unless they are specified to be locale-independent (i.e. only isdigit and isxdigit). The next commit will try to clean up uses of <ctype.h> functions within Clang. llvm-svn: 174765
2013-02-08 22:30:22 +00:00 · 2013-02-08 22:30:22 +00:00 · a2100d755a
parent 72fffbab66
commit a2100d755a
6 changed files with 626 additions and 170 deletions
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@ -0,0 +1,162 @@
+//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_BASIC_CHARINFO_H
+#define CLANG_BASIC_CHARINFO_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace clang {
+namespace charinfo {
+  // Classification bitmask for each possible unsigned char value. Declared
+  // here so the inline predicates below can index it; defined in CharInfo.cpp.
+  extern const uint16_t InfoTable[256];
+
+  // Single-bit classification flags stored in InfoTable. Each flag is a
+  // distinct bit, so they can be OR'ed together to form masks.
+  enum {
+    CHAR_HORZ_WS  = 0x0001,  // '\t', '\f', '\v'.  Note, no '\0'
+    CHAR_VERT_WS  = 0x0002,  // '\r', '\n'
+    CHAR_SPACE    = 0x0004,  // ' '
+    CHAR_DIGIT    = 0x0008,  // 0-9
+    CHAR_XLETTER  = 0x0010,  // a-f,A-F
+    CHAR_UPPER    = 0x0020,  // A-Z
+    CHAR_LOWER    = 0x0040,  // a-z
+    CHAR_UNDER    = 0x0080,  // _
+    CHAR_PERIOD   = 0x0100,  // .
+    CHAR_RAWDEL   = 0x0200,  // {}[]#<>%:;?*+-/^&|~!=,"'
+    CHAR_PUNCT    = 0x0400   // `$@() and backslash
+  };
+
+  // Combined flags used for the letters that are also hex digits.
+  enum {
+    CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER,  // A-F
+    CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER   // a-f
+  };
+} // end namespace charinfo
+
+/// Reports whether the given byte lies in the 7-bit ASCII range (0-127).
+///
+/// The argument is reinterpreted as an unsigned char before the comparison,
+/// so the answer does not depend on whether plain char is signed.
+LLVM_READNONE static inline bool isASCII(char c) {
+  const unsigned char Byte = static_cast<unsigned char>(c);
+  return Byte < 128;
+}
+
+/// Tests whether a character may begin a C identifier.
+///
+/// Letters [a-zA-Z] and '_' are always accepted. '$' is accepted only when
+/// AllowDollar is true; it defaults to false.
+LLVM_READONLY static inline bool isIdentifierHead(unsigned char c,
+                                                  bool AllowDollar = false) {
+  const unsigned HeadMask =
+      charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_UNDER;
+  if (AllowDollar && c == '$')
+    return true;
+  return (charinfo::InfoTable[c] & HeadMask) != 0;
+}
+
+/// Tests whether a character may appear after the first character of a C
+/// identifier.
+///
+/// Letters, digits and '_' ([a-zA-Z0-9_]) are always accepted. '$' is
+/// accepted only when AllowDollar is true; it defaults to false.
+LLVM_READONLY static inline bool isIdentifierBody(unsigned char c,
+                                                  bool AllowDollar = false) {
+  const unsigned BodyMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                            charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER;
+  if (AllowDollar && c == '$')
+    return true;
+  return (charinfo::InfoTable[c] & BodyMask) != 0;
+}
+
+/// Tests for horizontal ASCII whitespace: ' ', '\\t', '\\f' or '\\v'.
+///
+/// The space character has its own table flag (CHAR_SPACE), so both that flag
+/// and CHAR_HORZ_WS are checked. '\\0' is not whitespace.
+LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) {
+  const unsigned HorzMask = charinfo::CHAR_HORZ_WS | charinfo::CHAR_SPACE;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & HorzMask) != 0;
+}
+
+/// Tests for vertical ASCII whitespace, i.e. the line terminators '\\n' and
+/// '\\r'.
+///
+/// '\\0' is not whitespace.
+LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_VERT_WS) != 0;
+}
+
+/// Tests for any ASCII whitespace, horizontal or vertical:
+/// ' ', '\\t', '\\f', '\\v', '\\n' or '\\r'.
+///
+/// '\\0' is not whitespace.
+LLVM_READONLY static inline bool isWhitespace(unsigned char c) {
+  const unsigned SpaceMask = charinfo::CHAR_HORZ_WS | charinfo::CHAR_VERT_WS |
+                             charinfo::CHAR_SPACE;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & SpaceMask) != 0;
+}
+
+/// Tests for an ASCII decimal digit, [0-9], by checking the CHAR_DIGIT bit of
+/// the character's table entry.
+LLVM_READONLY static inline bool isDigit(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_DIGIT) != 0;
+}
+
+/// Tests for a lowercase ASCII letter, [a-z], by checking the CHAR_LOWER bit
+/// of the character's table entry.
+LLVM_READONLY static inline bool isLowercase(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_LOWER) != 0;
+}
+
+/// Tests for an uppercase ASCII letter, [A-Z], by checking the CHAR_UPPER bit
+/// of the character's table entry.
+LLVM_READONLY static inline bool isUppercase(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_UPPER) != 0;
+}
+
+/// Tests for an ASCII letter of either case, [a-zA-Z].
+LLVM_READONLY static inline bool isLetter(unsigned char c) {
+  const unsigned LetterMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & LetterMask) != 0;
+}
+
+/// Tests for an ASCII letter or decimal digit, [a-zA-Z0-9].
+LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) {
+  const unsigned AlnumMask =
+      charinfo::CHAR_DIGIT | charinfo::CHAR_UPPER | charinfo::CHAR_LOWER;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & AlnumMask) != 0;
+}
+
+/// Return true if this character is an ASCII hex digit: [0-9a-fA-F]
+///
+/// Decimal digits carry CHAR_DIGIT and the letters a-f/A-F carry CHAR_XLETTER
+/// in the table, so either bit qualifies the character.
+LLVM_READONLY static inline bool isHexDigit(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0;
+}
+
+/// Tests for an ASCII punctuation character.
+///
+/// Beware that '_' counts as punctuation here even though it is also a valid
+/// identifier character.
+LLVM_READONLY static inline bool isPunctuation(unsigned char c) {
+  const unsigned PunctMask = charinfo::CHAR_UNDER | charinfo::CHAR_PERIOD |
+                             charinfo::CHAR_RAWDEL | charinfo::CHAR_PUNCT;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & PunctMask) != 0;
+}
+
+/// Tests for a printable ASCII character, meaning one that should occupy
+/// exactly one column when printed in a fixed-width terminal.
+///
+/// Letters, digits, '_', '.', the other punctuation classes and ' ' qualify;
+/// the CHAR_HORZ_WS and CHAR_VERT_WS classes are deliberately left out.
+LLVM_READONLY static inline bool isPrintable(unsigned char c) {
+  const unsigned PrintMask =
+      charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_PERIOD |
+      charinfo::CHAR_PUNCT | charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
+      charinfo::CHAR_RAWDEL | charinfo::CHAR_SPACE;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & PrintMask) != 0;
+}
+
+/// Tests whether a character may appear in the body of a C preprocessing
+/// number, that is, whether it is one of [a-zA-Z0-9_.].
+LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) {
+  const unsigned NumberMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                              charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
+                              charinfo::CHAR_PERIOD;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & NumberMask) != 0;
+}
+
+/// Tests whether a character may appear in a C++ raw string delimiter.
+///
+/// Letters, digits, '_', '.' and the CHAR_RAWDEL class are accepted; the
+/// CHAR_PUNCT class and all whitespace are not.
+LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) {
+  const unsigned DelimMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                             charinfo::CHAR_PERIOD | charinfo::CHAR_DIGIT |
+                             charinfo::CHAR_UNDER | charinfo::CHAR_RAWDEL;
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & DelimMask) != 0;
+}
+
+} // end namespace clang
+
+#endif
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc)

 add_clang_library(clangBasic
  Builtins.cpp
+  CharInfo.cpp
  Diagnostic.cpp
  DiagnosticIDs.cpp
  FileManager.cpp
--- a/clang/lib/Basic/CharInfo.cpp
+++ b/clang/lib/Basic/CharInfo.cpp
@ -0,0 +1,80 @@
+//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+
+// Statically initialize CharInfo table based on ASCII character set
+// Reference: FreeBSD 7.2 /usr/share/misc/ascii
+//
+// Only the first 128 entries (the ASCII range) are spelled out. The array has
+// 256 elements, so entries 128-255 are implicitly zero-initialized and carry
+// no classification flags.
+const uint16_t clang::charinfo::InfoTable[256] =
+{
+  // 0 NUL         1 SOH         2 STX         3 ETX
+  // 4 EOT         5 ENQ         6 ACK         7 BEL
+  0           , 0           , 0           , 0           ,
+  0           , 0           , 0           , 0           ,
+  // 8 BS          9 HT         10 NL         11 VT
+  //12 NP         13 CR         14 SO         15 SI
+  0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
+  CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
+  //16 DLE        17 DC1        18 DC2        19 DC3
+  //20 DC4        21 NAK        22 SYN        23 ETB
+  0           , 0           , 0           , 0           ,
+  0           , 0           , 0           , 0           ,
+  //24 CAN        25 EM         26 SUB        27 ESC
+  //28 FS         29 GS         30 RS         31 US
+  0           , 0           , 0           , 0           ,
+  0           , 0           , 0           , 0           ,
+  //32 SP         33  !         34  "         35  #
+  //36  $         37  %         38  &         39  '
+  CHAR_SPACE  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+  CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+  //40  (         41  )         42  *         43  +
+  //44  ,         45  -         46  .         47  /
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL ,
+  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
+  //48  0         49  1         50  2         51  3
+  //52  4         53  5         54  6         55  7
+  CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
+  CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
+  //56  8         57  9         58  :         59  ;
+  //60  <         61  =         62  >         63  ?
+  CHAR_DIGIT  , CHAR_DIGIT  , CHAR_RAWDEL , CHAR_RAWDEL ,
+  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+  //64  @         65  A         66  B         67  C
+  //68  D         69  E         70  F         71  G
+  CHAR_PUNCT  , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
+  CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER  ,
+  //72  H         73  I         74  J         75  K
+  //76  L         77  M         78  N         79  O
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+  //80  P         81  Q         82  R         83  S
+  //84  T         85  U         86  V         87  W
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+  //88  X         89  Y         90  Z         91  [
+  //92  \         93  ]         94  ^         95  _
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_RAWDEL ,
+  CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
+  //96  `         97  a         98  b         99  c
+  //100  d       101  e        102  f        103  g
+  CHAR_PUNCT  , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
+  CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER  ,
+  //104  h       105  i        106  j        107  k
+  //108  l       109  m        110  n        111  o
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+  //112  p       113  q        114  r        115  s
+  //116  t       117  u        118  v        119  w
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+  //120  x       121  y        122  z        123  {
+  //124  |       125  }        126  ~        127 DEL
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_RAWDEL ,
+  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
+};
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -25,6 +25,7 @@
 //===----------------------------------------------------------------------===//

 #include "clang/Lex/Lexer.h"
+#include "clang/Basic/CharInfo.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Lex/LexDiagnostic.h"
@ -38,8 +39,6 @@
 #include <cstring>
 using namespace clang;

-static void InitCharacterInfo();
-
 //===----------------------------------------------------------------------===//
 // Token Class Implementation
 //===----------------------------------------------------------------------===//
@ -66,8 +65,6 @@ void Lexer::anchor() { }

 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
-  InitCharacterInfo();
-
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;
@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
 }


-
-static bool isWhitespace(unsigned char c);
-
 /// MeasureTokenLength - Relex the token at the specified location and return
 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
 /// includes a trigraph or an escaped newline) then this count includes bytes
@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
 }

-//===----------------------------------------------------------------------===//
-// Character information.
-//===----------------------------------------------------------------------===//
-
-enum {
-  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
-  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
-  CHAR_LETTER   = 0x04,  // a-z,A-Z
-  CHAR_NUMBER   = 0x08,  // 0-9
-  CHAR_UNDER    = 0x10,  // _
-  CHAR_PERIOD   = 0x20,  // .
-  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
-};
-
-// Statically initialize CharInfo table based on ASCII character set
-// Reference: FreeBSD 7.2 /usr/share/misc/ascii
-static const unsigned char CharInfo[256] =
-{
-// 0 NUL         1 SOH         2 STX         3 ETX
-// 4 EOT         5 ENQ         6 ACK         7 BEL
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-// 8 BS          9 HT         10 NL         11 VT
-//12 NP         13 CR         14 SO         15 SI
-   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
-   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
-//16 DLE        17 DC1        18 DC2        19 DC3
-//20 DC4        21 NAK        22 SYN        23 ETB
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-//24 CAN        25 EM         26 SUB        27 ESC
-//28 FS         29 GS         30 RS         31 US
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-//32 SP         33  !         34  "         35  #
-//36  $         37  %         38  &         39  '
-   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//40  (         41  )         42  *         43  +
-//44  ,         45  -         46  .         47  /
-   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
-//48  0         49  1         50  2         51  3
-//52  4         53  5         54  6         55  7
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-//56  8         57  9         58  :         59  ;
-//60  <         61  =         62  >         63  ?
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//64  @         65  A         66  B         67  C
-//68  D         69  E         70  F         71  G
-   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//72  H         73  I         74  J         75  K
-//76  L         77  M         78  N         79  O
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//80  P         81  Q         82  R         83  S
-//84  T         85  U         86  V         87  W
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//88  X         89  Y         90  Z         91  [
-//92  \         93  ]         94  ^         95  _
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
-   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
-//96  `         97  a         98  b         99  c
-//100  d       101  e        102  f        103  g
-   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//104  h       105  i        106  j        107  k
-//108  l       109  m        110  n        111  o
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//112  p       113  q        114  r        115  s
-//116  t       117  u        118  v        119  w
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//120  x       121  y        122  z        123  {
-//124  |       125  }        126  ~        127 DEL
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
-};
-
-static void InitCharacterInfo() {
-  static bool isInited = false;
-  if (isInited) return;
-  // check the statically-initialized CharInfo table
-  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
-  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
-  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
-  assert(CHAR_UNDER   == CharInfo[(int)'_']);
-  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
-  for (unsigned i = 'a'; i <= 'z'; ++i) {
-    assert(CHAR_LETTER == CharInfo[i]);
-    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
-  }
-  for (unsigned i = '0'; i <= '9'; ++i)
-    assert(CHAR_NUMBER == CharInfo[i]);
-    
-  isInited = true;
-}
-
-
-/// isIdentifierHead - Return true if this is the first character of an
-/// identifier, which is [a-zA-Z_].
-static inline bool isIdentifierHead(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
-}
-
-/// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
-static inline bool isIdentifierBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
-}
-
-/// isHorizontalWhitespace - Return true if this character is horizontal
-/// whitespace: ' ', '\\t', '\\f', '\\v'.  Note that this returns false for
-/// '\\0'.
-static inline bool isHorizontalWhitespace(unsigned char c) {
-  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
-}
-
-/// isVerticalWhitespace - Return true if this character is vertical
-/// whitespace: '\\n', '\\r'.  Note that this returns false for '\\0'.
-static inline bool isVerticalWhitespace(unsigned char c) {
-  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
-}
-
-/// isWhitespace - Return true if this character is horizontal or vertical
-/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'.  Note that this returns
-/// false for '\\0'.
-static inline bool isWhitespace(unsigned char c) {
-  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
-}
-
-/// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
-static inline bool isNumberBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
-    true : false;
-}
-
-/// isRawStringDelimBody - Return true if this is the body character of a
-/// raw string delimiter.
-static inline bool isRawStringDelimBody(unsigned char c) {
-  return (CharInfo[c] &
-          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
-    true : false;
-}
-
-// Allow external clients to make use of CharInfo.
 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
-  return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
+  return isIdentifierBody(c, LangOpts.DollarIdents);
 }


@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) {
         !(0xFE20 <= c && c <= 0xFE2F);
 }

-static inline bool isASCII(char C) {
-  return static_cast<signed char>(C) >= 0;
-}
-

 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
-  // TODO: Could merge these checks into a CharInfo flag to make the comparison
-  // cheaper
+  // TODO: Could merge these checks into an InfoTable flag to make the
+  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
 FinishIdentifier:
@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
-  while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
--- a/clang/unittests/Basic/CMakeLists.txt
+++ b/clang/unittests/Basic/CMakeLists.txt
@ -1,4 +1,5 @@
 add_clang_unittest(BasicTests
+  CharInfoTest.cpp
  FileManagerTest.cpp
  SourceManagerTest.cpp
  )
--- a/clang/unittests/Basic/CharInfoTest.cpp
+++ b/clang/unittests/Basic/CharInfoTest.cpp
@ -0,0 +1,377 @@
+//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace clang;
+
+// Check that the CharInfo table has been constructed reasonably.
+//
+// Every probed entry is compared for exact equality with the expected flag
+// value, so an unexpected extra bit in the table fails the test as well.
+TEST(CharInfoTest, validateInfoTable) {
+  using namespace charinfo;
+  EXPECT_EQ((unsigned)CHAR_SPACE,   InfoTable[(unsigned)' ']);
+  EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\t']);
+  // Form feed and vertical tab are expected to be classified as horizontal
+  // whitespace.
+  EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\f']);
+  EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\v']);
+  EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\n']);
+  EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\r']);
+  EXPECT_EQ((unsigned)CHAR_UNDER,   InfoTable[(unsigned)'_']);
+  EXPECT_EQ((unsigned)CHAR_PERIOD,  InfoTable[(unsigned)'.']);
+
+  // a-f and A-F carry the hex-letter bit in addition to their case bit.
+  for (unsigned i = 'a'; i <= 'f'; ++i) {
+    EXPECT_EQ((unsigned)CHAR_XLOWER, InfoTable[i]);
+    EXPECT_EQ((unsigned)CHAR_XUPPER, InfoTable[i+'A'-'a']);
+  }
+
+  // The remaining letters carry only their case bit.
+  for (unsigned i = 'g'; i <= 'z'; ++i) {
+    EXPECT_EQ((unsigned)CHAR_LOWER, InfoTable[i]);
+    EXPECT_EQ((unsigned)CHAR_UPPER, InfoTable[i+'A'-'a']);
+  }
+
+  for (unsigned i = '0'; i <= '9'; ++i)
+    EXPECT_EQ((unsigned)CHAR_DIGIT, InfoTable[i]);
+}
+
+// Check various predicates.
+// isASCII accepts every value up to and including 0x7f (NUL included) and
+// rejects bytes with the high bit set.
+TEST(CharInfoTest, isASCII) {
+  EXPECT_TRUE(isASCII('\0'));
+  EXPECT_TRUE(isASCII('\n'));
+  EXPECT_TRUE(isASCII(' '));
+  EXPECT_TRUE(isASCII('a'));
+  EXPECT_TRUE(isASCII('\x7f'));
+  EXPECT_FALSE(isASCII('\x80'));
+  EXPECT_FALSE(isASCII('\xc2'));
+  EXPECT_FALSE(isASCII('\xff'));
+}
+
+// isIdentifierHead accepts letters and '_' but not digits; '$' is accepted
+// only when AllowDollar is passed as true.
+TEST(CharInfoTest, isIdentifierHead) {
+  EXPECT_TRUE(isIdentifierHead('a'));
+  EXPECT_TRUE(isIdentifierHead('A'));
+  EXPECT_TRUE(isIdentifierHead('z'));
+  EXPECT_TRUE(isIdentifierHead('Z'));
+  EXPECT_TRUE(isIdentifierHead('_'));
+
+  EXPECT_FALSE(isIdentifierHead('0'));
+  EXPECT_FALSE(isIdentifierHead('.'));
+  EXPECT_FALSE(isIdentifierHead('`'));
+  EXPECT_FALSE(isIdentifierHead('\0'));
+
+  // '$' is rejected by default and accepted on request.
+  EXPECT_FALSE(isIdentifierHead('$'));
+  EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true));
+
+  // Bytes outside the ASCII range are never identifier characters here.
+  EXPECT_FALSE(isIdentifierHead('\x80'));
+  EXPECT_FALSE(isIdentifierHead('\xc2'));
+  EXPECT_FALSE(isIdentifierHead('\xff'));
+}
+
+// isIdentifierBody accepts letters, digits and '_'; '$' is accepted only when
+// AllowDollar is passed as true.
+TEST(CharInfoTest, isIdentifierBody) {
+  EXPECT_TRUE(isIdentifierBody('a'));
+  EXPECT_TRUE(isIdentifierBody('A'));
+  EXPECT_TRUE(isIdentifierBody('z'));
+  EXPECT_TRUE(isIdentifierBody('Z'));
+  EXPECT_TRUE(isIdentifierBody('_'));
+
+  // Unlike the head predicate, digits are allowed in the body.
+  EXPECT_TRUE(isIdentifierBody('0'));
+  EXPECT_FALSE(isIdentifierBody('.'));
+  EXPECT_FALSE(isIdentifierBody('`'));
+  EXPECT_FALSE(isIdentifierBody('\0'));
+
+  // '$' is rejected by default and accepted on request.
+  EXPECT_FALSE(isIdentifierBody('$'));
+  EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true));
+
+  EXPECT_FALSE(isIdentifierBody('\x80'));
+  EXPECT_FALSE(isIdentifierBody('\xc2'));
+  EXPECT_FALSE(isIdentifierBody('\xff'));
+}
+
+// isHorizontalWhitespace accepts ' ', '\t', '\f' and '\v' and rejects the
+// line terminators, NUL, DEL and non-ASCII bytes.
+TEST(CharInfoTest, isHorizontalWhitespace) {
+  EXPECT_FALSE(isHorizontalWhitespace('a'));
+  EXPECT_FALSE(isHorizontalWhitespace('_'));
+  EXPECT_FALSE(isHorizontalWhitespace('0'));
+  EXPECT_FALSE(isHorizontalWhitespace('.'));
+  EXPECT_FALSE(isHorizontalWhitespace('`'));
+  EXPECT_FALSE(isHorizontalWhitespace('\0'));
+  EXPECT_FALSE(isHorizontalWhitespace('\x7f'));
+
+  EXPECT_TRUE(isHorizontalWhitespace(' '));
+  EXPECT_TRUE(isHorizontalWhitespace('\t'));
+  // Form feed and vertical tab are expected to count as horizontal
+  // whitespace.
+  EXPECT_TRUE(isHorizontalWhitespace('\f'));
+  EXPECT_TRUE(isHorizontalWhitespace('\v'));
+
+  EXPECT_FALSE(isHorizontalWhitespace('\n'));
+  EXPECT_FALSE(isHorizontalWhitespace('\r'));
+
+  EXPECT_FALSE(isHorizontalWhitespace('\x80'));
+  EXPECT_FALSE(isHorizontalWhitespace('\xc2'));
+  EXPECT_FALSE(isHorizontalWhitespace('\xff'));
+}
+
+// isVerticalWhitespace accepts only '\n' and '\r'.
+TEST(CharInfoTest, isVerticalWhitespace) {
+  EXPECT_FALSE(isVerticalWhitespace('a'));
+  EXPECT_FALSE(isVerticalWhitespace('_'));
+  EXPECT_FALSE(isVerticalWhitespace('0'));
+  EXPECT_FALSE(isVerticalWhitespace('.'));
+  EXPECT_FALSE(isVerticalWhitespace('`'));
+  EXPECT_FALSE(isVerticalWhitespace('\0'));
+  EXPECT_FALSE(isVerticalWhitespace('\x7f'));
+
+  EXPECT_FALSE(isVerticalWhitespace(' '));
+  EXPECT_FALSE(isVerticalWhitespace('\t'));
+  // Despite its name, vertical tab (like form feed) is expected NOT to be
+  // vertical whitespace.
+  EXPECT_FALSE(isVerticalWhitespace('\f'));
+  EXPECT_FALSE(isVerticalWhitespace('\v'));
+
+  EXPECT_TRUE(isVerticalWhitespace('\n'));
+  EXPECT_TRUE(isVerticalWhitespace('\r'));
+
+  EXPECT_FALSE(isVerticalWhitespace('\x80'));
+  EXPECT_FALSE(isVerticalWhitespace('\xc2'));
+  EXPECT_FALSE(isVerticalWhitespace('\xff'));
+}
+
+// isWhitespace accepts both the horizontal and the vertical whitespace
+// characters and rejects NUL, DEL and non-ASCII bytes.
+TEST(CharInfoTest, isWhitespace) {
+  EXPECT_FALSE(isWhitespace('a'));
+  EXPECT_FALSE(isWhitespace('_'));
+  EXPECT_FALSE(isWhitespace('0'));
+  EXPECT_FALSE(isWhitespace('.'));
+  EXPECT_FALSE(isWhitespace('`'));
+  EXPECT_FALSE(isWhitespace('\0'));
+  EXPECT_FALSE(isWhitespace('\x7f'));
+
+  EXPECT_TRUE(isWhitespace(' '));
+  EXPECT_TRUE(isWhitespace('\t'));
+  EXPECT_TRUE(isWhitespace('\f'));
+  EXPECT_TRUE(isWhitespace('\v'));
+
+  EXPECT_TRUE(isWhitespace('\n'));
+  EXPECT_TRUE(isWhitespace('\r'));
+
+  EXPECT_FALSE(isWhitespace('\x80'));
+  EXPECT_FALSE(isWhitespace('\xc2'));
+  EXPECT_FALSE(isWhitespace('\xff'));
+}
+
+// isDigit accepts only decimal digits; letters (including the hex letters),
+// punctuation, NUL and non-ASCII bytes are rejected.
+TEST(CharInfoTest, isDigit) {
+  EXPECT_TRUE(isDigit('0'));
+  EXPECT_TRUE(isDigit('9'));
+
+  EXPECT_FALSE(isDigit('a'));
+  EXPECT_FALSE(isDigit('A'));
+
+  EXPECT_FALSE(isDigit('z'));
+  EXPECT_FALSE(isDigit('Z'));
+  
+  EXPECT_FALSE(isDigit('.'));
+  EXPECT_FALSE(isDigit('_'));
+
+  EXPECT_FALSE(isDigit('/'));
+  EXPECT_FALSE(isDigit('\0'));
+
+  EXPECT_FALSE(isDigit('\x80'));
+  EXPECT_FALSE(isDigit('\xc2'));
+  EXPECT_FALSE(isDigit('\xff'));
+}
+
+// isHexDigit accepts decimal digits and hex letters of either case, and
+// rejects letters outside a-f/A-F.
+TEST(CharInfoTest, isHexDigit) {
+  EXPECT_TRUE(isHexDigit('0'));
+  EXPECT_TRUE(isHexDigit('9'));
+
+  EXPECT_TRUE(isHexDigit('a'));
+  EXPECT_TRUE(isHexDigit('A'));
+
+  EXPECT_FALSE(isHexDigit('z'));
+  EXPECT_FALSE(isHexDigit('Z'));
+  
+  EXPECT_FALSE(isHexDigit('.'));
+  EXPECT_FALSE(isHexDigit('_'));
+
+  EXPECT_FALSE(isHexDigit('/'));
+  EXPECT_FALSE(isHexDigit('\0'));
+
+  EXPECT_FALSE(isHexDigit('\x80'));
+  EXPECT_FALSE(isHexDigit('\xc2'));
+  EXPECT_FALSE(isHexDigit('\xff'));
+}
+
+// isLetter accepts letters of either case and rejects digits, '_' and other
+// punctuation, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isLetter) {
+  EXPECT_FALSE(isLetter('0'));
+  EXPECT_FALSE(isLetter('9'));
+
+  EXPECT_TRUE(isLetter('a'));
+  EXPECT_TRUE(isLetter('A'));
+
+  EXPECT_TRUE(isLetter('z'));
+  EXPECT_TRUE(isLetter('Z'));
+  
+  EXPECT_FALSE(isLetter('.'));
+  EXPECT_FALSE(isLetter('_'));
+
+  EXPECT_FALSE(isLetter('/'));
+  EXPECT_FALSE(isLetter('('));
+  EXPECT_FALSE(isLetter('\0'));
+
+  EXPECT_FALSE(isLetter('\x80'));
+  EXPECT_FALSE(isLetter('\xc2'));
+  EXPECT_FALSE(isLetter('\xff'));
+}
+
+// isLowercase accepts only lowercase letters; uppercase letters are rejected
+// along with digits, punctuation, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isLowercase) {
+  EXPECT_FALSE(isLowercase('0'));
+  EXPECT_FALSE(isLowercase('9'));
+
+  EXPECT_TRUE(isLowercase('a'));
+  EXPECT_FALSE(isLowercase('A'));
+
+  EXPECT_TRUE(isLowercase('z'));
+  EXPECT_FALSE(isLowercase('Z'));
+  
+  EXPECT_FALSE(isLowercase('.'));
+  EXPECT_FALSE(isLowercase('_'));
+
+  EXPECT_FALSE(isLowercase('/'));
+  EXPECT_FALSE(isLowercase('('));
+  EXPECT_FALSE(isLowercase('\0'));
+
+  EXPECT_FALSE(isLowercase('\x80'));
+  EXPECT_FALSE(isLowercase('\xc2'));
+  EXPECT_FALSE(isLowercase('\xff'));
+}
+
+// isUppercase accepts only uppercase letters; lowercase letters are rejected
+// along with digits, punctuation, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isUppercase) {
+  EXPECT_FALSE(isUppercase('0'));
+  EXPECT_FALSE(isUppercase('9'));
+
+  EXPECT_FALSE(isUppercase('a'));
+  EXPECT_TRUE(isUppercase('A'));
+
+  EXPECT_FALSE(isUppercase('z'));
+  EXPECT_TRUE(isUppercase('Z'));
+
+  EXPECT_FALSE(isUppercase('.'));
+  EXPECT_FALSE(isUppercase('_'));
+
+  EXPECT_FALSE(isUppercase('/'));
+  EXPECT_FALSE(isUppercase('('));
+  EXPECT_FALSE(isUppercase('\0'));
+
+  EXPECT_FALSE(isUppercase('\x80'));
+  EXPECT_FALSE(isUppercase('\xc2'));
+  EXPECT_FALSE(isUppercase('\xff'));
+}
+
+// isAlphanumeric accepts letters and digits; '_' is NOT alphanumeric, and
+// neither are other punctuation, NUL or non-ASCII bytes.
+TEST(CharInfoTest, isAlphanumeric) {
+  EXPECT_TRUE(isAlphanumeric('0'));
+  EXPECT_TRUE(isAlphanumeric('9'));
+
+  EXPECT_TRUE(isAlphanumeric('a'));
+  EXPECT_TRUE(isAlphanumeric('A'));
+
+  EXPECT_TRUE(isAlphanumeric('z'));
+  EXPECT_TRUE(isAlphanumeric('Z'));
+
+  EXPECT_FALSE(isAlphanumeric('.'));
+  EXPECT_FALSE(isAlphanumeric('_'));
+
+  EXPECT_FALSE(isAlphanumeric('/'));
+  EXPECT_FALSE(isAlphanumeric('('));
+  EXPECT_FALSE(isAlphanumeric('\0'));
+
+  EXPECT_FALSE(isAlphanumeric('\x80'));
+  EXPECT_FALSE(isAlphanumeric('\xc2'));
+  EXPECT_FALSE(isAlphanumeric('\xff'));
+}
+
+// isPunctuation accepts '.', '_', '/' and '(' and rejects letters, digits,
+// whitespace, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isPunctuation) {
+  EXPECT_FALSE(isPunctuation('0'));
+  EXPECT_FALSE(isPunctuation('9'));
+
+  EXPECT_FALSE(isPunctuation('a'));
+  EXPECT_FALSE(isPunctuation('A'));
+
+  EXPECT_FALSE(isPunctuation('z'));
+  EXPECT_FALSE(isPunctuation('Z'));
+
+  // '_' counts as punctuation even though it is also an identifier character.
+  EXPECT_TRUE(isPunctuation('.'));
+  EXPECT_TRUE(isPunctuation('_'));
+
+  EXPECT_TRUE(isPunctuation('/'));
+  EXPECT_TRUE(isPunctuation('('));
+
+  EXPECT_FALSE(isPunctuation(' '));
+  EXPECT_FALSE(isPunctuation('\n'));
+  EXPECT_FALSE(isPunctuation('\0'));
+
+  EXPECT_FALSE(isPunctuation('\x80'));
+  EXPECT_FALSE(isPunctuation('\xc2'));
+  EXPECT_FALSE(isPunctuation('\xff'));
+}
+
+// isPrintable accepts letters, digits, punctuation and the space character,
+// and rejects tab, newline, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isPrintable) {
+  EXPECT_TRUE(isPrintable('0'));
+  EXPECT_TRUE(isPrintable('9'));
+
+  EXPECT_TRUE(isPrintable('a'));
+  EXPECT_TRUE(isPrintable('A'));
+
+  EXPECT_TRUE(isPrintable('z'));
+  EXPECT_TRUE(isPrintable('Z'));
+
+  EXPECT_TRUE(isPrintable('.'));
+  EXPECT_TRUE(isPrintable('_'));
+
+  EXPECT_TRUE(isPrintable('/'));
+  EXPECT_TRUE(isPrintable('('));
+
+  // Space is the only whitespace character expected to be printable.
+  EXPECT_TRUE(isPrintable(' '));
+  EXPECT_FALSE(isPrintable('\t'));
+  EXPECT_FALSE(isPrintable('\n'));
+  EXPECT_FALSE(isPrintable('\0'));
+
+  EXPECT_FALSE(isPrintable('\x80'));
+  EXPECT_FALSE(isPrintable('\xc2'));
+  EXPECT_FALSE(isPrintable('\xff'));
+}
+
+// isPreprocessingNumberBody accepts letters, digits, '.' and '_' and rejects
+// other punctuation, NUL and non-ASCII bytes.
+TEST(CharInfoTest, isPreprocessingNumberBody) {
+  EXPECT_TRUE(isPreprocessingNumberBody('0'));
+  EXPECT_TRUE(isPreprocessingNumberBody('9'));
+
+  EXPECT_TRUE(isPreprocessingNumberBody('a'));
+  EXPECT_TRUE(isPreprocessingNumberBody('A'));
+
+  EXPECT_TRUE(isPreprocessingNumberBody('z'));
+  EXPECT_TRUE(isPreprocessingNumberBody('Z'));
+  EXPECT_TRUE(isPreprocessingNumberBody('.'));
+  EXPECT_TRUE(isPreprocessingNumberBody('_'));
+
+  EXPECT_FALSE(isPreprocessingNumberBody('/'));
+  EXPECT_FALSE(isPreprocessingNumberBody('('));
+  EXPECT_FALSE(isPreprocessingNumberBody('\0'));
+
+  EXPECT_FALSE(isPreprocessingNumberBody('\x80'));
+  EXPECT_FALSE(isPreprocessingNumberBody('\xc2'));
+  EXPECT_FALSE(isPreprocessingNumberBody('\xff'));
+}
+
+TEST(CharInfoTest, isRawStringDelimBody) {
+  EXPECT_TRUE(isRawStringDelimBody('0'));
+  EXPECT_TRUE(isRawStringDelimBody('9'));
+
+  EXPECT_TRUE(isRawStringDelimBody('a'));
+  EXPECT_TRUE(isRawStringDelimBody('A'));
+
+  EXPECT_TRUE(isRawStringDelimBody('z'));
+  EXPECT_TRUE(isRawStringDelimBody('Z'));
+  EXPECT_TRUE(isRawStringDelimBody('.'));
+  EXPECT_TRUE(isRawStringDelimBody('_'));
+
+  EXPECT_TRUE(isRawStringDelimBody('/'));
+  EXPECT_FALSE(isRawStringDelimBody('('));
+  EXPECT_FALSE(isRawStringDelimBody('\0'));
+}