Pull Lexer's CharInfo table out for general use throughout Clang.

Rewriting the same predicates over and over again is bad for code size and
code maintenance. Using the functions in <ctype.h> is generally unsafe
unless they are specified to be locale-independent (i.e. only isdigit and
isxdigit).

The next commit will try to clean up uses of <ctype.h> functions within Clang.

llvm-svn: 174765
This commit is contained in:
Jordan Rose 2013-02-08 22:30:22 +00:00
parent 72fffbab66
commit a2100d755a
6 changed files with 626 additions and 170 deletions

View File

@ -0,0 +1,162 @@
//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef CLANG_BASIC_CHARINFO_H
#define CLANG_BASIC_CHARINFO_H
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace clang {
namespace charinfo {
extern const uint16_t InfoTable[256];
// Bit flags stored in InfoTable. Each table entry is the set of flags that
// apply to the character with that code; the predicates in this header test
// an entry against a mask built from these flags.
enum {
CHAR_HORZ_WS = 0x0001, // '\t', '\f', '\v'. Note, no '\0' (' ' is CHAR_SPACE)
CHAR_VERT_WS = 0x0002, // '\r', '\n'
CHAR_SPACE = 0x0004, // ' '
CHAR_DIGIT = 0x0008, // 0-9
CHAR_XLETTER = 0x0010, // a-f,A-F
CHAR_UPPER = 0x0020, // A-Z
CHAR_LOWER = 0x0040, // a-z
CHAR_UNDER = 0x0080, // _
CHAR_PERIOD = 0x0100, // .
CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"'
CHAR_PUNCT = 0x0400 // `$@() and the backslash character
};
// Combined flags used for the letters that are also hexadecimal digits.
enum {
CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER, // A-F
CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER // a-f
};
} // end namespace charinfo
/// Determines whether \p c lies in the 7-bit ASCII range (0x00-0x7F).
///
/// The value is inspected as an unsigned byte, so the answer does not depend
/// on whether plain \c char is signed.
LLVM_READNONE static inline bool isASCII(char c) {
  const unsigned char Byte = static_cast<unsigned char>(c);
  return Byte < 0x80;
}
/// Determines whether \p c may start a C identifier: a letter or an
/// underscore, i.e. [a-zA-Z_]. When \p AllowDollar is set, '$' is accepted
/// as well.
LLVM_READONLY static inline bool isIdentifierHead(unsigned char c,
                                                  bool AllowDollar = false) {
  const unsigned HeadMask =
      charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_UNDER;
  const bool IsLetterOrUnderscore = (charinfo::InfoTable[c] & HeadMask) != 0;
  return IsLetterOrUnderscore || (AllowDollar && c == '$');
}
/// Determines whether \p c may appear after the first character of a C
/// identifier: a letter, digit or underscore, i.e. [a-zA-Z0-9_]. When
/// \p AllowDollar is set, '$' is accepted as well.
LLVM_READONLY static inline bool isIdentifierBody(unsigned char c,
                                                  bool AllowDollar = false) {
  const unsigned BodyMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
                            charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER;
  if ((charinfo::InfoTable[c] & BodyMask) == 0)
    return AllowDollar && c == '$';
  return true;
}
/// Determines whether \p c is horizontal ASCII whitespace, meaning one of
/// ' ', '\\t', '\\f' or '\\v'.
///
/// '\\0' is not whitespace and yields false.
LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) {
  const unsigned HorizontalMask =
      charinfo::CHAR_HORZ_WS | charinfo::CHAR_SPACE;
  return (charinfo::InfoTable[c] & HorizontalMask) != 0;
}
/// Determines whether \p c is vertical ASCII whitespace, meaning '\\n' or
/// '\\r'.
///
/// '\\0' is not whitespace and yields false.
LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) {
  const unsigned Flags = charinfo::InfoTable[c];
  return (Flags & charinfo::CHAR_VERT_WS) != 0;
}
/// Determines whether \p c is any ASCII whitespace, horizontal or vertical:
/// ' ', '\\t', '\\f', '\\v', '\\n' or '\\r'.
///
/// '\\0' is not whitespace and yields false.
LLVM_READONLY static inline bool isWhitespace(unsigned char c) {
  const unsigned AnyWhitespaceMask = charinfo::CHAR_HORZ_WS |
                                     charinfo::CHAR_VERT_WS |
                                     charinfo::CHAR_SPACE;
  return (charinfo::InfoTable[c] & AnyWhitespaceMask) != 0;
}
/// Determines whether \p c is an ASCII decimal digit: [0-9]
LLVM_READONLY static inline bool isDigit(unsigned char c) {
  const unsigned Flags = charinfo::InfoTable[c];
  return (Flags & charinfo::CHAR_DIGIT) != 0;
}
/// Determines whether \p c is a lowercase ASCII letter: [a-z]
LLVM_READONLY static inline bool isLowercase(unsigned char c) {
  const unsigned Flags = charinfo::InfoTable[c];
  return (Flags & charinfo::CHAR_LOWER) != 0;
}
/// Determines whether \p c is an uppercase ASCII letter: [A-Z]
LLVM_READONLY static inline bool isUppercase(unsigned char c) {
  const unsigned Flags = charinfo::InfoTable[c];
  return (Flags & charinfo::CHAR_UPPER) != 0;
}
/// Determines whether \p c is an ASCII letter of either case: [a-zA-Z]
LLVM_READONLY static inline bool isLetter(unsigned char c) {
  const unsigned LetterMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER;
  return (charinfo::InfoTable[c] & LetterMask) != 0;
}
/// Determines whether \p c is an ASCII letter or decimal digit: [a-zA-Z0-9]
LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) {
  const unsigned AlnumMask =
      charinfo::CHAR_DIGIT | charinfo::CHAR_UPPER | charinfo::CHAR_LOWER;
  return (charinfo::InfoTable[c] & AlnumMask) != 0;
}
/// Return true if this character is an ASCII hex digit: [0-9a-fA-F]
///
/// A character qualifies when its table entry carries CHAR_DIGIT (0-9) or
/// CHAR_XLETTER (a-f, A-F); every other character yields false.
LLVM_READONLY static inline bool isHexDigit(unsigned char c) {
  using namespace charinfo;
  return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0;
}
/// Determines whether \p c is an ASCII punctuation character.
///
/// Be aware that '_' counts as punctuation here even though it is also an
/// identifier character.
LLVM_READONLY static inline bool isPunctuation(unsigned char c) {
  const unsigned PunctuationMask = charinfo::CHAR_UNDER |
                                   charinfo::CHAR_PERIOD |
                                   charinfo::CHAR_RAWDEL |
                                   charinfo::CHAR_PUNCT;
  return (charinfo::InfoTable[c] & PunctuationMask) != 0;
}
/// Determines whether \p c is a printable ASCII character, i.e. one that is
/// expected to occupy exactly one column on a fixed-width terminal. This
/// covers letters, digits, punctuation and the space character; tabs,
/// newlines and other control characters are excluded.
LLVM_READONLY static inline bool isPrintable(unsigned char c) {
  const unsigned PrintableMask =
      charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_PERIOD |
      charinfo::CHAR_PUNCT | charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
      charinfo::CHAR_RAWDEL | charinfo::CHAR_SPACE;
  return (charinfo::InfoTable[c] & PrintableMask) != 0;
}
/// Determines whether \p c may appear in the body of a C preprocessing
/// number: a letter, digit, underscore or period, i.e. [a-zA-Z0-9_.].
LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) {
  const unsigned PPNumberMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
                                charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
                                charinfo::CHAR_PERIOD;
  return (charinfo::InfoTable[c] & PPNumberMask) != 0;
}
/// Determines whether \p c may appear in the delimiter of a C++ raw string
/// literal: letters, digits, '_', '.' and the characters flagged CHAR_RAWDEL.
LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) {
  const unsigned DelimiterMask =
      charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_PERIOD |
      charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER | charinfo::CHAR_RAWDEL;
  return (charinfo::InfoTable[c] & DelimiterMask) != 0;
}
} // end namespace clang
#endif

View File

@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc)
add_clang_library(clangBasic
Builtins.cpp
CharInfo.cpp
Diagnostic.cpp
DiagnosticIDs.cpp
FileManager.cpp

View File

@ -0,0 +1,80 @@
//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/CharInfo.h"
// Statically initialize CharInfo table based on ASCII character set
// Reference: FreeBSD 7.2 /usr/share/misc/ascii
//
// The table is indexed by character code. Each pair of comment lines below
// names the eight codes whose flag values follow on the next two lines.
// Only the first 128 entries are listed explicitly.
const uint16_t clang::charinfo::InfoTable[256] =
{
// 0 NUL 1 SOH 2 STX 3 ETX
// 4 EOT 5 ENQ 6 ACK 7 BEL
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
// 8 BS 9 HT 10 NL 11 VT
//12 NP 13 CR 14 SO 15 SI
0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
//16 DLE 17 DC1 18 DC2 19 DC3
//20 DC4 21 NAK 22 SYN 23 ETB
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
//24 CAN 25 EM 26 SUB 27 ESC
//28 FS 29 GS 30 RS 31 US
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
//32 SP 33 ! 34 " 35 #
//36 $ 37 % 38 & 39 '
CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40 ( 41 ) 42 * 43 +
//44 , 45 - 46 . 47 /
CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48 0 49 1 50 2 51 3
//52 4 53 5 54 6 55 7
CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
//56 8 57 9 58 : 59 ;
//60 < 61 = 62 > 63 ?
CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64 @ 65 A 66 B 67 C
//68 D 69 E 70 F 71 G
CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER ,
//72 H 73 I 74 J 75 K
//76 L 77 M 78 N 79 O
CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
//80 P 81 Q 82 R 83 S
//84 T 85 U 86 V 87 W
CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
//88 X 89 Y 90 Z 91 [
//92 \ 93 ] 94 ^ 95 _
CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL ,
CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
//96 ` 97 a 98 b 99 c
//100 d 101 e 102 f 103 g
CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER ,
//104 h 105 i 106 j 107 k
//108 l 109 m 110 n 111 o
CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
//112 p 113 q 114 r 115 s
//116 t 117 u 118 v 119 w
CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
//120 x 121 y 122 z 123 {
//124 | 125 } 126 ~ 127 DEL
CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
// Codes 128-255 have no initializer and are therefore zero (no flags set).
};

View File

@ -25,6 +25,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Lex/Lexer.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
@ -38,8 +39,6 @@
#include <cstring>
using namespace clang;
static void InitCharacterInfo();
//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//
@ -66,8 +65,6 @@ void Lexer::anchor() { }
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
InitCharacterInfo();
BufferStart = BufStart;
BufferPtr = BufPtr;
BufferEnd = BufEnd;
@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
}
static bool isWhitespace(unsigned char c);
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//
// Bit flags stored in the Lexer-local CharInfo table below; each table entry
// holds the flag for the character with that code.
enum {
CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
CHAR_VERT_WS = 0x02, // '\r', '\n'
CHAR_LETTER = 0x04, // a-z,A-Z
CHAR_NUMBER = 0x08, // 0-9
CHAR_UNDER = 0x10, // _
CHAR_PERIOD = 0x20, // .
CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"'
};
// Statically initialize CharInfo table based on ASCII character set
// Reference: FreeBSD 7.2 /usr/share/misc/ascii
//
// The table is indexed by character code. Each pair of comment lines below
// names the eight codes whose flag values follow on the next two lines.
// Only the first 128 entries are listed explicitly.
static const unsigned char CharInfo[256] =
{
// 0 NUL 1 SOH 2 STX 3 ETX
// 4 EOT 5 ENQ 6 ACK 7 BEL
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
// 8 BS 9 HT 10 NL 11 VT
//12 NP 13 CR 14 SO 15 SI
0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
//16 DLE 17 DC1 18 DC2 19 DC3
//20 DC4 21 NAK 22 SYN 23 ETB
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
//24 CAN 25 EM 26 SUB 27 ESC
//28 FS 29 GS 30 RS 31 US
0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 ,
//32 SP 33 ! 34 " 35 #
//36 $ 37 % 38 & 39 '
CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40 ( 41 ) 42 * 43 +
//44 , 45 - 46 . 47 /
0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48 0 49 1 50 2 51 3
//52 4 53 5 54 6 55 7
CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56 8 57 9 58 : 59 ;
//60 < 61 = 62 > 63 ?
CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64 @ 65 A 66 B 67 C
//68 D 69 E 70 F 71 G
0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//72 H 73 I 74 J 75 K
//76 L 77 M 78 N 79 O
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//80 P 81 Q 82 R 83 S
//84 T 85 U 86 V 87 W
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88 X 89 Y 90 Z 91 [
//92 \ 93 ] 94 ^ 95 _
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
//96 ` 97 a 98 b 99 c
//100 d 101 e 102 f 103 g
0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//104 h 105 i 106 j 107 k
//108 l 109 m 110 n 111 o
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//112 p 113 q 114 r 115 s
//116 t 117 u 118 v 119 w
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120 x 121 y 122 z 123 {
//124 | 125 } 126 ~ 127 DEL
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
// Codes 128-255 have no initializer and are therefore zero (no flags set).
};
/// Sanity-checks selected entries of the statically-initialized CharInfo
/// table: the whitespace characters, '_', '.', every ASCII letter and every
/// decimal digit. The checks are assert()s, so they only have an effect in
/// builds where assertions are enabled.
///
/// The function-local isInited flag makes every call after the first return
/// immediately.
static void InitCharacterInfo() {
static bool isInited = false;
if (isInited) return;
// check the statically-initialized CharInfo table
assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
assert(CHAR_UNDER == CharInfo[(int)'_']);
assert(CHAR_PERIOD == CharInfo[(int)'.']);
// Walk the lowercase letters and check the matching uppercase letter too.
for (unsigned i = 'a'; i <= 'z'; ++i) {
assert(CHAR_LETTER == CharInfo[i]);
assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
}
for (unsigned i = '0'; i <= '9'; ++i)
assert(CHAR_NUMBER == CharInfo[i]);
isInited = true;
}
/// isIdentifierHead - Determine whether \p c can begin an identifier, i.e.
/// whether it is a letter or an underscore: [a-zA-Z_].
static inline bool isIdentifierHead(unsigned char c) {
  const unsigned HeadMask = CHAR_LETTER | CHAR_UNDER;
  return (CharInfo[c] & HeadMask) != 0;
}
/// isIdentifierBody - Determine whether \p c can continue an identifier,
/// i.e. whether it is a letter, digit or underscore: [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  const unsigned BodyMask = CHAR_LETTER | CHAR_NUMBER | CHAR_UNDER;
  return (CharInfo[c] & BodyMask) != 0;
}
/// isHorizontalWhitespace - Determine whether \p c is horizontal whitespace:
/// ' ', '\\t', '\\f' or '\\v'. '\\0' is not whitespace and yields false.
static inline bool isHorizontalWhitespace(unsigned char c) {
  const unsigned Flags = CharInfo[c];
  return (Flags & CHAR_HORZ_WS) != 0;
}
/// isVerticalWhitespace - Determine whether \p c is vertical whitespace:
/// '\\n' or '\\r'. '\\0' is not whitespace and yields false.
static inline bool isVerticalWhitespace(unsigned char c) {
  const unsigned Flags = CharInfo[c];
  return (Flags & CHAR_VERT_WS) != 0;
}
/// isWhitespace - Determine whether \p c is whitespace of either kind,
/// horizontal or vertical: ' ', '\\t', '\\f', '\\v', '\\n' or '\\r'. '\\0' is
/// not whitespace and yields false.
static inline bool isWhitespace(unsigned char c) {
  const unsigned AnyWhitespaceMask = CHAR_HORZ_WS | CHAR_VERT_WS;
  return (CharInfo[c] & AnyWhitespaceMask) != 0;
}
/// isNumberBody - Determine whether \p c can appear in the body of a
/// preprocessing number: a letter, digit, underscore or period,
/// i.e. [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
  const unsigned NumberMask =
      CHAR_LETTER | CHAR_NUMBER | CHAR_UNDER | CHAR_PERIOD;
  return (CharInfo[c] & NumberMask) != 0;
}
/// isRawStringDelimBody - Determine whether \p c can appear in the delimiter
/// of a raw string: letters, digits, '_', '.' and the CHAR_RAWDEL characters.
static inline bool isRawStringDelimBody(unsigned char c) {
  const unsigned DelimiterMask =
      CHAR_LETTER | CHAR_NUMBER | CHAR_UNDER | CHAR_PERIOD | CHAR_RAWDEL;
  return (CharInfo[c] & DelimiterMask) != 0;
}
// Allow external clients to make use of CharInfo.
//
// Returns true if \p c may appear in the body of an identifier. '$' is
// accepted only when LangOpts.DollarIdents is set; that decision is delegated
// to isIdentifierBody through its AllowDollar argument.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  return isIdentifierBody(c, LangOpts.DollarIdents);
}
@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) {
!(0xFE20 <= c && c <= 0xFE2F);
}
/// Determines whether \p C is a 7-bit ASCII character, i.e. whether its high
/// bit is clear.
static inline bool isASCII(char C) {
  const unsigned char Byte = static_cast<unsigned char>(C);
  return (Byte & 0x80) == 0;
}
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Fast path, no $,\,? in identifier found. '\' might be an escaped newline
// or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
//
// TODO: Could merge these checks into a CharInfo flag to make the comparison
// cheaper
// TODO: Could merge these checks into an InfoTable flag to make the
// comparison cheaper
if (isASCII(C) && C != '\\' && C != '?' &&
(C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);

View File

@ -1,4 +1,5 @@
add_clang_unittest(BasicTests
CharInfoTest.cpp
FileManagerTest.cpp
SourceManagerTest.cpp
)

View File

@ -0,0 +1,377 @@
//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/CharInfo.h"
#include "gtest/gtest.h"
using namespace llvm;
using namespace clang;
// Check that the CharInfo table has been constructed reasonably.
// Each checked entry must hold exactly the expected flag value, not merely
// contain it.
TEST(CharInfoTest, validateInfoTable) {
using namespace charinfo;
EXPECT_EQ((unsigned)CHAR_SPACE, InfoTable[(unsigned)' ']);
EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\t']);
EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\f']); // ??
EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\v']); // ??
EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\n']);
EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\r']);
EXPECT_EQ((unsigned)CHAR_UNDER, InfoTable[(unsigned)'_']);
EXPECT_EQ((unsigned)CHAR_PERIOD, InfoTable[(unsigned)'.']);
// a-f / A-F are letters that are also hex digits.
for (unsigned i = 'a'; i <= 'f'; ++i) {
EXPECT_EQ((unsigned)CHAR_XLOWER, InfoTable[i]);
EXPECT_EQ((unsigned)CHAR_XUPPER, InfoTable[i+'A'-'a']);
}
// g-z / G-Z are plain letters.
for (unsigned i = 'g'; i <= 'z'; ++i) {
EXPECT_EQ((unsigned)CHAR_LOWER, InfoTable[i]);
EXPECT_EQ((unsigned)CHAR_UPPER, InfoTable[i+'A'-'a']);
}
for (unsigned i = '0'; i <= '9'; ++i)
EXPECT_EQ((unsigned)CHAR_DIGIT, InfoTable[i]);
}
// Check various predicates.
// isASCII: codes 0x00-0x7f are ASCII, bytes with the high bit set are not.
TEST(CharInfoTest, isASCII) {
EXPECT_TRUE(isASCII('\0'));
EXPECT_TRUE(isASCII('\n'));
EXPECT_TRUE(isASCII(' '));
EXPECT_TRUE(isASCII('a'));
EXPECT_TRUE(isASCII('\x7f'));
EXPECT_FALSE(isASCII('\x80'));
EXPECT_FALSE(isASCII('\xc2'));
EXPECT_FALSE(isASCII('\xff'));
}
// isIdentifierHead: letters and '_' qualify; digits, other punctuation, NUL
// and high-bit bytes do not. '$' qualifies only with AllowDollar.
TEST(CharInfoTest, isIdentifierHead) {
EXPECT_TRUE(isIdentifierHead('a'));
EXPECT_TRUE(isIdentifierHead('A'));
EXPECT_TRUE(isIdentifierHead('z'));
EXPECT_TRUE(isIdentifierHead('Z'));
EXPECT_TRUE(isIdentifierHead('_'));
EXPECT_FALSE(isIdentifierHead('0'));
EXPECT_FALSE(isIdentifierHead('.'));
EXPECT_FALSE(isIdentifierHead('`'));
EXPECT_FALSE(isIdentifierHead('\0'));
EXPECT_FALSE(isIdentifierHead('$'));
EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true));
EXPECT_FALSE(isIdentifierHead('\x80'));
EXPECT_FALSE(isIdentifierHead('\xc2'));
EXPECT_FALSE(isIdentifierHead('\xff'));
}
// isIdentifierBody: letters, digits and '_' qualify; other punctuation, NUL
// and high-bit bytes do not. '$' qualifies only with AllowDollar.
TEST(CharInfoTest, isIdentifierBody) {
EXPECT_TRUE(isIdentifierBody('a'));
EXPECT_TRUE(isIdentifierBody('A'));
EXPECT_TRUE(isIdentifierBody('z'));
EXPECT_TRUE(isIdentifierBody('Z'));
EXPECT_TRUE(isIdentifierBody('_'));
EXPECT_TRUE(isIdentifierBody('0'));
EXPECT_FALSE(isIdentifierBody('.'));
EXPECT_FALSE(isIdentifierBody('`'));
EXPECT_FALSE(isIdentifierBody('\0'));
EXPECT_FALSE(isIdentifierBody('$'));
EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true));
EXPECT_FALSE(isIdentifierBody('\x80'));
EXPECT_FALSE(isIdentifierBody('\xc2'));
EXPECT_FALSE(isIdentifierBody('\xff'));
}
// isHorizontalWhitespace: ' ', '\t', '\f' and '\v' qualify; newlines, NUL,
// DEL, printable characters and high-bit bytes do not.
TEST(CharInfoTest, isHorizontalWhitespace) {
EXPECT_FALSE(isHorizontalWhitespace('a'));
EXPECT_FALSE(isHorizontalWhitespace('_'));
EXPECT_FALSE(isHorizontalWhitespace('0'));
EXPECT_FALSE(isHorizontalWhitespace('.'));
EXPECT_FALSE(isHorizontalWhitespace('`'));
EXPECT_FALSE(isHorizontalWhitespace('\0'));
EXPECT_FALSE(isHorizontalWhitespace('\x7f'));
EXPECT_TRUE(isHorizontalWhitespace(' '));
EXPECT_TRUE(isHorizontalWhitespace('\t'));
EXPECT_TRUE(isHorizontalWhitespace('\f')); // ??
EXPECT_TRUE(isHorizontalWhitespace('\v')); // ??
EXPECT_FALSE(isHorizontalWhitespace('\n'));
EXPECT_FALSE(isHorizontalWhitespace('\r'));
EXPECT_FALSE(isHorizontalWhitespace('\x80'));
EXPECT_FALSE(isHorizontalWhitespace('\xc2'));
EXPECT_FALSE(isHorizontalWhitespace('\xff'));
}
// isVerticalWhitespace: only '\n' and '\r' qualify; horizontal whitespace,
// NUL, DEL, printable characters and high-bit bytes do not.
TEST(CharInfoTest, isVerticalWhitespace) {
EXPECT_FALSE(isVerticalWhitespace('a'));
EXPECT_FALSE(isVerticalWhitespace('_'));
EXPECT_FALSE(isVerticalWhitespace('0'));
EXPECT_FALSE(isVerticalWhitespace('.'));
EXPECT_FALSE(isVerticalWhitespace('`'));
EXPECT_FALSE(isVerticalWhitespace('\0'));
EXPECT_FALSE(isVerticalWhitespace('\x7f'));
EXPECT_FALSE(isVerticalWhitespace(' '));
EXPECT_FALSE(isVerticalWhitespace('\t'));
EXPECT_FALSE(isVerticalWhitespace('\f')); // ??
EXPECT_FALSE(isVerticalWhitespace('\v')); // ??
EXPECT_TRUE(isVerticalWhitespace('\n'));
EXPECT_TRUE(isVerticalWhitespace('\r'));
EXPECT_FALSE(isVerticalWhitespace('\x80'));
EXPECT_FALSE(isVerticalWhitespace('\xc2'));
EXPECT_FALSE(isVerticalWhitespace('\xff'));
}
// isWhitespace: the union of horizontal and vertical whitespace qualifies;
// NUL, DEL, printable characters and high-bit bytes do not.
TEST(CharInfoTest, isWhitespace) {
EXPECT_FALSE(isWhitespace('a'));
EXPECT_FALSE(isWhitespace('_'));
EXPECT_FALSE(isWhitespace('0'));
EXPECT_FALSE(isWhitespace('.'));
EXPECT_FALSE(isWhitespace('`'));
EXPECT_FALSE(isWhitespace('\0'));
EXPECT_FALSE(isWhitespace('\x7f'));
EXPECT_TRUE(isWhitespace(' '));
EXPECT_TRUE(isWhitespace('\t'));
EXPECT_TRUE(isWhitespace('\f'));
EXPECT_TRUE(isWhitespace('\v'));
EXPECT_TRUE(isWhitespace('\n'));
EXPECT_TRUE(isWhitespace('\r'));
EXPECT_FALSE(isWhitespace('\x80'));
EXPECT_FALSE(isWhitespace('\xc2'));
EXPECT_FALSE(isWhitespace('\xff'));
}
// isDigit: only '0'-'9' qualify; '/' is checked as the character just below
// '0' in ASCII.
TEST(CharInfoTest, isDigit) {
EXPECT_TRUE(isDigit('0'));
EXPECT_TRUE(isDigit('9'));
EXPECT_FALSE(isDigit('a'));
EXPECT_FALSE(isDigit('A'));
EXPECT_FALSE(isDigit('z'));
EXPECT_FALSE(isDigit('Z'));
EXPECT_FALSE(isDigit('.'));
EXPECT_FALSE(isDigit('_'));
EXPECT_FALSE(isDigit('/'));
EXPECT_FALSE(isDigit('\0'));
EXPECT_FALSE(isDigit('\x80'));
EXPECT_FALSE(isDigit('\xc2'));
EXPECT_FALSE(isDigit('\xff'));
}
// isHexDigit: decimal digits and the letters a-f / A-F qualify; letters
// outside that range, punctuation, NUL and high-bit bytes do not.
TEST(CharInfoTest, isHexDigit) {
EXPECT_TRUE(isHexDigit('0'));
EXPECT_TRUE(isHexDigit('9'));
EXPECT_TRUE(isHexDigit('a'));
EXPECT_TRUE(isHexDigit('A'));
EXPECT_FALSE(isHexDigit('z'));
EXPECT_FALSE(isHexDigit('Z'));
EXPECT_FALSE(isHexDigit('.'));
EXPECT_FALSE(isHexDigit('_'));
EXPECT_FALSE(isHexDigit('/'));
EXPECT_FALSE(isHexDigit('\0'));
EXPECT_FALSE(isHexDigit('\x80'));
EXPECT_FALSE(isHexDigit('\xc2'));
EXPECT_FALSE(isHexDigit('\xff'));
}
// isLetter: letters of either case qualify; digits, punctuation (including
// '_'), NUL and high-bit bytes do not.
TEST(CharInfoTest, isLetter) {
EXPECT_FALSE(isLetter('0'));
EXPECT_FALSE(isLetter('9'));
EXPECT_TRUE(isLetter('a'));
EXPECT_TRUE(isLetter('A'));
EXPECT_TRUE(isLetter('z'));
EXPECT_TRUE(isLetter('Z'));
EXPECT_FALSE(isLetter('.'));
EXPECT_FALSE(isLetter('_'));
EXPECT_FALSE(isLetter('/'));
EXPECT_FALSE(isLetter('('));
EXPECT_FALSE(isLetter('\0'));
EXPECT_FALSE(isLetter('\x80'));
EXPECT_FALSE(isLetter('\xc2'));
EXPECT_FALSE(isLetter('\xff'));
}
// isLowercase: only a-z qualify; uppercase letters, digits, punctuation, NUL
// and high-bit bytes do not.
TEST(CharInfoTest, isLowercase) {
EXPECT_FALSE(isLowercase('0'));
EXPECT_FALSE(isLowercase('9'));
EXPECT_TRUE(isLowercase('a'));
EXPECT_FALSE(isLowercase('A'));
EXPECT_TRUE(isLowercase('z'));
EXPECT_FALSE(isLowercase('Z'));
EXPECT_FALSE(isLowercase('.'));
EXPECT_FALSE(isLowercase('_'));
EXPECT_FALSE(isLowercase('/'));
EXPECT_FALSE(isLowercase('('));
EXPECT_FALSE(isLowercase('\0'));
EXPECT_FALSE(isLowercase('\x80'));
EXPECT_FALSE(isLowercase('\xc2'));
EXPECT_FALSE(isLowercase('\xff'));
}
// isUppercase: only A-Z qualify; lowercase letters, digits, punctuation, NUL
// and high-bit bytes do not.
TEST(CharInfoTest, isUppercase) {
EXPECT_FALSE(isUppercase('0'));
EXPECT_FALSE(isUppercase('9'));
EXPECT_FALSE(isUppercase('a'));
EXPECT_TRUE(isUppercase('A'));
EXPECT_FALSE(isUppercase('z'));
EXPECT_TRUE(isUppercase('Z'));
EXPECT_FALSE(isUppercase('.'));
EXPECT_FALSE(isUppercase('_'));
EXPECT_FALSE(isUppercase('/'));
EXPECT_FALSE(isUppercase('('));
EXPECT_FALSE(isUppercase('\0'));
EXPECT_FALSE(isUppercase('\x80'));
EXPECT_FALSE(isUppercase('\xc2'));
EXPECT_FALSE(isUppercase('\xff'));
}
// isAlphanumeric: letters and digits qualify; punctuation (including '_'),
// NUL and high-bit bytes do not.
TEST(CharInfoTest, isAlphanumeric) {
EXPECT_TRUE(isAlphanumeric('0'));
EXPECT_TRUE(isAlphanumeric('9'));
EXPECT_TRUE(isAlphanumeric('a'));
EXPECT_TRUE(isAlphanumeric('A'));
EXPECT_TRUE(isAlphanumeric('z'));
EXPECT_TRUE(isAlphanumeric('Z'));
EXPECT_FALSE(isAlphanumeric('.'));
EXPECT_FALSE(isAlphanumeric('_'));
EXPECT_FALSE(isAlphanumeric('/'));
EXPECT_FALSE(isAlphanumeric('('));
EXPECT_FALSE(isAlphanumeric('\0'));
EXPECT_FALSE(isAlphanumeric('\x80'));
EXPECT_FALSE(isAlphanumeric('\xc2'));
EXPECT_FALSE(isAlphanumeric('\xff'));
}
// isPunctuation: '.', '_', '/' and '(' qualify (one character from each of
// the four punctuation flags); letters, digits, whitespace, NUL and high-bit
// bytes do not.
TEST(CharInfoTest, isPunctuation) {
EXPECT_FALSE(isPunctuation('0'));
EXPECT_FALSE(isPunctuation('9'));
EXPECT_FALSE(isPunctuation('a'));
EXPECT_FALSE(isPunctuation('A'));
EXPECT_FALSE(isPunctuation('z'));
EXPECT_FALSE(isPunctuation('Z'));
EXPECT_TRUE(isPunctuation('.'));
EXPECT_TRUE(isPunctuation('_'));
EXPECT_TRUE(isPunctuation('/'));
EXPECT_TRUE(isPunctuation('('));
EXPECT_FALSE(isPunctuation(' '));
EXPECT_FALSE(isPunctuation('\n'));
EXPECT_FALSE(isPunctuation('\0'));
EXPECT_FALSE(isPunctuation('\x80'));
EXPECT_FALSE(isPunctuation('\xc2'));
EXPECT_FALSE(isPunctuation('\xff'));
}
// isPrintable: letters, digits, punctuation and ' ' qualify; tab, newline,
// NUL and high-bit bytes do not.
TEST(CharInfoTest, isPrintable) {
EXPECT_TRUE(isPrintable('0'));
EXPECT_TRUE(isPrintable('9'));
EXPECT_TRUE(isPrintable('a'));
EXPECT_TRUE(isPrintable('A'));
EXPECT_TRUE(isPrintable('z'));
EXPECT_TRUE(isPrintable('Z'));
EXPECT_TRUE(isPrintable('.'));
EXPECT_TRUE(isPrintable('_'));
EXPECT_TRUE(isPrintable('/'));
EXPECT_TRUE(isPrintable('('));
EXPECT_TRUE(isPrintable(' '));
EXPECT_FALSE(isPrintable('\t'));
EXPECT_FALSE(isPrintable('\n'));
EXPECT_FALSE(isPrintable('\0'));
EXPECT_FALSE(isPrintable('\x80'));
EXPECT_FALSE(isPrintable('\xc2'));
EXPECT_FALSE(isPrintable('\xff'));
}
// isPreprocessingNumberBody: letters, digits, '.' and '_' qualify; other
// punctuation, NUL and high-bit bytes do not.
TEST(CharInfoTest, isPreprocessingNumberBody) {
EXPECT_TRUE(isPreprocessingNumberBody('0'));
EXPECT_TRUE(isPreprocessingNumberBody('9'));
EXPECT_TRUE(isPreprocessingNumberBody('a'));
EXPECT_TRUE(isPreprocessingNumberBody('A'));
EXPECT_TRUE(isPreprocessingNumberBody('z'));
EXPECT_TRUE(isPreprocessingNumberBody('Z'));
EXPECT_TRUE(isPreprocessingNumberBody('.'));
EXPECT_TRUE(isPreprocessingNumberBody('_'));
EXPECT_FALSE(isPreprocessingNumberBody('/'));
EXPECT_FALSE(isPreprocessingNumberBody('('));
EXPECT_FALSE(isPreprocessingNumberBody('\0'));
EXPECT_FALSE(isPreprocessingNumberBody('\x80'));
EXPECT_FALSE(isPreprocessingNumberBody('\xc2'));
EXPECT_FALSE(isPreprocessingNumberBody('\xff'));
}
// isRawStringDelimBody: letters, digits, '.', '_' and the CHAR_RAWDEL
// characters (represented here by '/') qualify. Space, parentheses,
// backslash, NUL and high-bit bytes do not.
TEST(CharInfoTest, isRawStringDelimBody) {
  EXPECT_TRUE(isRawStringDelimBody('0'));
  EXPECT_TRUE(isRawStringDelimBody('9'));
  EXPECT_TRUE(isRawStringDelimBody('a'));
  EXPECT_TRUE(isRawStringDelimBody('A'));
  EXPECT_TRUE(isRawStringDelimBody('z'));
  EXPECT_TRUE(isRawStringDelimBody('Z'));
  EXPECT_TRUE(isRawStringDelimBody('.'));
  EXPECT_TRUE(isRawStringDelimBody('_'));
  EXPECT_TRUE(isRawStringDelimBody('/'));

  // Characters that may not appear in a raw string delimiter.
  EXPECT_FALSE(isRawStringDelimBody('('));
  EXPECT_FALSE(isRawStringDelimBody(')'));
  EXPECT_FALSE(isRawStringDelimBody(' '));
  EXPECT_FALSE(isRawStringDelimBody('\\'));
  EXPECT_FALSE(isRawStringDelimBody('\0'));

  // Bytes outside ASCII, matching the checks in the other predicate tests.
  EXPECT_FALSE(isRawStringDelimBody('\x80'));
  EXPECT_FALSE(isRawStringDelimBody('\xc2'));
  EXPECT_FALSE(isRawStringDelimBody('\xff'));
}