forked from OSchip/llvm-project
Implement a case-folding version of DJB hash
Summary: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the Dwarf 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseMatching.txt file from the Unicode spec using a python script (which is also included in this patch). The script tries to optimize the function by coalescing adjecant mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjecant blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on utf8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ascii characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 llvm-svn: 325107
This commit is contained in:
parent
eb18d772a1
commit
f1440978a1
|
@ -20,6 +20,10 @@ namespace llvm {
|
|||
|
||||
/// The Bernstein hash function used by the DWARF accelerator tables.
|
||||
uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);
|
||||
|
||||
/// Computes the Bernstein hash after folding the input according to the Dwarf 5
|
||||
/// standard case folding rules.
|
||||
uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_SUPPORT_DJB_H
|
||||
|
|
|
@ -60,6 +60,10 @@ bool isPrintable(int UCS);
|
|||
/// * 1 for each of the remaining characters.
|
||||
int columnWidthUTF8(StringRef Text);
|
||||
|
||||
/// Fold input unicode character according the the Simple unicode case folding
|
||||
/// rules.
|
||||
int foldCharSimple(int C);
|
||||
|
||||
} // namespace unicode
|
||||
} // namespace sys
|
||||
} // namespace llvm
|
||||
|
|
|
@ -113,6 +113,7 @@ add_llvm_library(LLVMSupport
|
|||
Triple.cpp
|
||||
Twine.cpp
|
||||
Unicode.cpp
|
||||
UnicodeCaseFold.cpp
|
||||
YAMLParser.cpp
|
||||
YAMLTraits.cpp
|
||||
raw_os_ostream.cpp
|
||||
|
|
|
@ -12,9 +12,85 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/DJB.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/Support/Compiler.h"
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "llvm/Support/Unicode.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
static inline uint32_t djbHashChar(char C, uint32_t H) {
|
||||
return (H << 5) + H + C;
|
||||
}
|
||||
|
||||
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
|
||||
for (char C : Buffer.bytes())
|
||||
H = ((H << 5) + H) + C;
|
||||
H = djbHashChar(C, H);
|
||||
return H;
|
||||
}
|
||||
|
||||
static UTF32 chopOneUTF32(StringRef &Buffer) {
|
||||
UTF32 C;
|
||||
const UTF8 *const Begin8Const =
|
||||
reinterpret_cast<const UTF8 *>(Buffer.begin());
|
||||
const UTF8 *Begin8 = Begin8Const;
|
||||
UTF32 *Begin32 = &C;
|
||||
|
||||
// In lenient mode we will always end up with a "reasonable" value in C for
|
||||
// non-empty input.
|
||||
assert(!Buffer.empty());
|
||||
ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
|
||||
&Begin32, &C + 1, lenientConversion);
|
||||
Buffer = Buffer.drop_front(Begin8 - Begin8Const);
|
||||
return C;
|
||||
}
|
||||
|
||||
static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
|
||||
const UTF32 *Begin32 = &C;
|
||||
UTF8 *Begin8 = Storage.begin();
|
||||
|
||||
// The case-folded output should always be a valid unicode character, so use
|
||||
// strict mode here.
|
||||
ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
|
||||
Storage.end(), strictConversion);
|
||||
assert(CR == conversionOK && "Case folding produced invalid char?");
|
||||
(void)CR;
|
||||
return StringRef(reinterpret_cast<char *>(Storage.begin()),
|
||||
Begin8 - Storage.begin());
|
||||
}
|
||||
|
||||
static UTF32 foldCharDwarf(UTF32 C) {
|
||||
// DWARF v5 addition to the unicode folding rules.
|
||||
// Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
|
||||
// Above" into "i".
|
||||
if (C == 0x130 || C == 0x131)
|
||||
return 'i';
|
||||
return sys::unicode::foldCharSimple(C);
|
||||
}
|
||||
|
||||
static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
|
||||
UTF32 C = chopOneUTF32(Buffer);
|
||||
|
||||
C = foldCharDwarf(C);
|
||||
|
||||
std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
|
||||
StringRef Folded = toUTF8(C, Storage);
|
||||
return djbHash(Folded, H);
|
||||
}
|
||||
|
||||
uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
|
||||
while (!Buffer.empty()) {
|
||||
unsigned char C = Buffer.front();
|
||||
if (LLVM_LIKELY(C <= 0x7f)) {
|
||||
// US-ASCII, encoded as one character in utf-8.
|
||||
// This is by far the most common case, so handle this specially.
|
||||
if (C >= 'A' && C <= 'Z')
|
||||
C = 'a' + (C - 'A'); // fold uppercase into lowercase
|
||||
H = djbHashChar(C, H);
|
||||
Buffer = Buffer.drop_front();
|
||||
continue;
|
||||
}
|
||||
H = caseFoldingDjbHashCharSlow(Buffer, H);
|
||||
}
|
||||
return H;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,742 @@
|
|||
//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//
|
||||
//
|
||||
// This file was generated by utils/unicode-case-fold.py from the Unicode
|
||||
// case folding database at
|
||||
// http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt
|
||||
//
|
||||
// To regenerate this file, run:
|
||||
// utils/unicode-case-fold.py \
|
||||
// "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \
|
||||
// > lib/Support/UnicodeCaseFold.cpp
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/Unicode.h"
|
||||
|
||||
int llvm::sys::unicode::foldCharSimple(int C) {
|
||||
if (C < 0x0041)
|
||||
return C;
|
||||
// 26 characters
|
||||
if (C <= 0x005a)
|
||||
return C + 32;
|
||||
// MICRO SIGN
|
||||
if (C == 0x00b5)
|
||||
return 0x03bc;
|
||||
if (C < 0x00c0)
|
||||
return C;
|
||||
// 23 characters
|
||||
if (C <= 0x00d6)
|
||||
return C + 32;
|
||||
if (C < 0x00d8)
|
||||
return C;
|
||||
// 7 characters
|
||||
if (C <= 0x00de)
|
||||
return C + 32;
|
||||
if (C < 0x0100)
|
||||
return C;
|
||||
// 24 characters
|
||||
if (C <= 0x012e)
|
||||
return C | 1;
|
||||
if (C < 0x0132)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x0136)
|
||||
return C | 1;
|
||||
if (C < 0x0139)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x0147 && C % 2 == 1)
|
||||
return C + 1;
|
||||
if (C < 0x014a)
|
||||
return C;
|
||||
// 23 characters
|
||||
if (C <= 0x0176)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
if (C == 0x0178)
|
||||
return 0x00ff;
|
||||
if (C < 0x0179)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x017d && C % 2 == 1)
|
||||
return C + 1;
|
||||
// LATIN SMALL LETTER LONG S
|
||||
if (C == 0x017f)
|
||||
return 0x0073;
|
||||
// LATIN CAPITAL LETTER B WITH HOOK
|
||||
if (C == 0x0181)
|
||||
return 0x0253;
|
||||
if (C < 0x0182)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x0184)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER OPEN O
|
||||
if (C == 0x0186)
|
||||
return 0x0254;
|
||||
// LATIN CAPITAL LETTER C WITH HOOK
|
||||
if (C == 0x0187)
|
||||
return 0x0188;
|
||||
if (C < 0x0189)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x018a)
|
||||
return C + 205;
|
||||
// LATIN CAPITAL LETTER D WITH TOPBAR
|
||||
if (C == 0x018b)
|
||||
return 0x018c;
|
||||
// LATIN CAPITAL LETTER REVERSED E
|
||||
if (C == 0x018e)
|
||||
return 0x01dd;
|
||||
// LATIN CAPITAL LETTER SCHWA
|
||||
if (C == 0x018f)
|
||||
return 0x0259;
|
||||
// LATIN CAPITAL LETTER OPEN E
|
||||
if (C == 0x0190)
|
||||
return 0x025b;
|
||||
// LATIN CAPITAL LETTER F WITH HOOK
|
||||
if (C == 0x0191)
|
||||
return 0x0192;
|
||||
// LATIN CAPITAL LETTER G WITH HOOK
|
||||
if (C == 0x0193)
|
||||
return 0x0260;
|
||||
// LATIN CAPITAL LETTER GAMMA
|
||||
if (C == 0x0194)
|
||||
return 0x0263;
|
||||
// LATIN CAPITAL LETTER IOTA
|
||||
if (C == 0x0196)
|
||||
return 0x0269;
|
||||
// LATIN CAPITAL LETTER I WITH STROKE
|
||||
if (C == 0x0197)
|
||||
return 0x0268;
|
||||
// LATIN CAPITAL LETTER K WITH HOOK
|
||||
if (C == 0x0198)
|
||||
return 0x0199;
|
||||
// LATIN CAPITAL LETTER TURNED M
|
||||
if (C == 0x019c)
|
||||
return 0x026f;
|
||||
// LATIN CAPITAL LETTER N WITH LEFT HOOK
|
||||
if (C == 0x019d)
|
||||
return 0x0272;
|
||||
// LATIN CAPITAL LETTER O WITH MIDDLE TILDE
|
||||
if (C == 0x019f)
|
||||
return 0x0275;
|
||||
if (C < 0x01a0)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x01a4)
|
||||
return C | 1;
|
||||
// LATIN LETTER YR
|
||||
if (C == 0x01a6)
|
||||
return 0x0280;
|
||||
// LATIN CAPITAL LETTER TONE TWO
|
||||
if (C == 0x01a7)
|
||||
return 0x01a8;
|
||||
// LATIN CAPITAL LETTER ESH
|
||||
if (C == 0x01a9)
|
||||
return 0x0283;
|
||||
// LATIN CAPITAL LETTER T WITH HOOK
|
||||
if (C == 0x01ac)
|
||||
return 0x01ad;
|
||||
// LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
|
||||
if (C == 0x01ae)
|
||||
return 0x0288;
|
||||
// LATIN CAPITAL LETTER U WITH HORN
|
||||
if (C == 0x01af)
|
||||
return 0x01b0;
|
||||
if (C < 0x01b1)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x01b2)
|
||||
return C + 217;
|
||||
if (C < 0x01b3)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x01b5 && C % 2 == 1)
|
||||
return C + 1;
|
||||
// LATIN CAPITAL LETTER EZH
|
||||
if (C == 0x01b7)
|
||||
return 0x0292;
|
||||
if (C < 0x01b8)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x01bc && C % 4 == 0)
|
||||
return C + 1;
|
||||
// LATIN CAPITAL LETTER DZ WITH CARON
|
||||
if (C == 0x01c4)
|
||||
return 0x01c6;
|
||||
// LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
|
||||
if (C == 0x01c5)
|
||||
return 0x01c6;
|
||||
// LATIN CAPITAL LETTER LJ
|
||||
if (C == 0x01c7)
|
||||
return 0x01c9;
|
||||
// LATIN CAPITAL LETTER L WITH SMALL LETTER J
|
||||
if (C == 0x01c8)
|
||||
return 0x01c9;
|
||||
// LATIN CAPITAL LETTER NJ
|
||||
if (C == 0x01ca)
|
||||
return 0x01cc;
|
||||
if (C < 0x01cb)
|
||||
return C;
|
||||
// 9 characters
|
||||
if (C <= 0x01db && C % 2 == 1)
|
||||
return C + 1;
|
||||
if (C < 0x01de)
|
||||
return C;
|
||||
// 9 characters
|
||||
if (C <= 0x01ee)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER DZ
|
||||
if (C == 0x01f1)
|
||||
return 0x01f3;
|
||||
if (C < 0x01f2)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x01f4)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER HWAIR
|
||||
if (C == 0x01f6)
|
||||
return 0x0195;
|
||||
// LATIN CAPITAL LETTER WYNN
|
||||
if (C == 0x01f7)
|
||||
return 0x01bf;
|
||||
if (C < 0x01f8)
|
||||
return C;
|
||||
// 20 characters
|
||||
if (C <= 0x021e)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
|
||||
if (C == 0x0220)
|
||||
return 0x019e;
|
||||
if (C < 0x0222)
|
||||
return C;
|
||||
// 9 characters
|
||||
if (C <= 0x0232)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER A WITH STROKE
|
||||
if (C == 0x023a)
|
||||
return 0x2c65;
|
||||
// LATIN CAPITAL LETTER C WITH STROKE
|
||||
if (C == 0x023b)
|
||||
return 0x023c;
|
||||
// LATIN CAPITAL LETTER L WITH BAR
|
||||
if (C == 0x023d)
|
||||
return 0x019a;
|
||||
// LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
|
||||
if (C == 0x023e)
|
||||
return 0x2c66;
|
||||
// LATIN CAPITAL LETTER GLOTTAL STOP
|
||||
if (C == 0x0241)
|
||||
return 0x0242;
|
||||
// LATIN CAPITAL LETTER B WITH STROKE
|
||||
if (C == 0x0243)
|
||||
return 0x0180;
|
||||
// LATIN CAPITAL LETTER U BAR
|
||||
if (C == 0x0244)
|
||||
return 0x0289;
|
||||
// LATIN CAPITAL LETTER TURNED V
|
||||
if (C == 0x0245)
|
||||
return 0x028c;
|
||||
if (C < 0x0246)
|
||||
return C;
|
||||
// 5 characters
|
||||
if (C <= 0x024e)
|
||||
return C | 1;
|
||||
// COMBINING GREEK YPOGEGRAMMENI
|
||||
if (C == 0x0345)
|
||||
return 0x03b9;
|
||||
if (C < 0x0370)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x0372)
|
||||
return C | 1;
|
||||
// GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
|
||||
if (C == 0x0376)
|
||||
return 0x0377;
|
||||
// GREEK CAPITAL LETTER YOT
|
||||
if (C == 0x037f)
|
||||
return 0x03f3;
|
||||
// GREEK CAPITAL LETTER ALPHA WITH TONOS
|
||||
if (C == 0x0386)
|
||||
return 0x03ac;
|
||||
if (C < 0x0388)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x038a)
|
||||
return C + 37;
|
||||
// GREEK CAPITAL LETTER OMICRON WITH TONOS
|
||||
if (C == 0x038c)
|
||||
return 0x03cc;
|
||||
if (C < 0x038e)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x038f)
|
||||
return C + 63;
|
||||
if (C < 0x0391)
|
||||
return C;
|
||||
// 17 characters
|
||||
if (C <= 0x03a1)
|
||||
return C + 32;
|
||||
if (C < 0x03a3)
|
||||
return C;
|
||||
// 9 characters
|
||||
if (C <= 0x03ab)
|
||||
return C + 32;
|
||||
// GREEK SMALL LETTER FINAL SIGMA
|
||||
if (C == 0x03c2)
|
||||
return 0x03c3;
|
||||
// GREEK CAPITAL KAI SYMBOL
|
||||
if (C == 0x03cf)
|
||||
return 0x03d7;
|
||||
// GREEK BETA SYMBOL
|
||||
if (C == 0x03d0)
|
||||
return 0x03b2;
|
||||
// GREEK THETA SYMBOL
|
||||
if (C == 0x03d1)
|
||||
return 0x03b8;
|
||||
// GREEK PHI SYMBOL
|
||||
if (C == 0x03d5)
|
||||
return 0x03c6;
|
||||
// GREEK PI SYMBOL
|
||||
if (C == 0x03d6)
|
||||
return 0x03c0;
|
||||
if (C < 0x03d8)
|
||||
return C;
|
||||
// 12 characters
|
||||
if (C <= 0x03ee)
|
||||
return C | 1;
|
||||
// GREEK KAPPA SYMBOL
|
||||
if (C == 0x03f0)
|
||||
return 0x03ba;
|
||||
// GREEK RHO SYMBOL
|
||||
if (C == 0x03f1)
|
||||
return 0x03c1;
|
||||
// GREEK CAPITAL THETA SYMBOL
|
||||
if (C == 0x03f4)
|
||||
return 0x03b8;
|
||||
// GREEK LUNATE EPSILON SYMBOL
|
||||
if (C == 0x03f5)
|
||||
return 0x03b5;
|
||||
// GREEK CAPITAL LETTER SHO
|
||||
if (C == 0x03f7)
|
||||
return 0x03f8;
|
||||
// GREEK CAPITAL LUNATE SIGMA SYMBOL
|
||||
if (C == 0x03f9)
|
||||
return 0x03f2;
|
||||
// GREEK CAPITAL LETTER SAN
|
||||
if (C == 0x03fa)
|
||||
return 0x03fb;
|
||||
if (C < 0x03fd)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x03ff)
|
||||
return C + -130;
|
||||
if (C < 0x0400)
|
||||
return C;
|
||||
// 16 characters
|
||||
if (C <= 0x040f)
|
||||
return C + 80;
|
||||
if (C < 0x0410)
|
||||
return C;
|
||||
// 32 characters
|
||||
if (C <= 0x042f)
|
||||
return C + 32;
|
||||
if (C < 0x0460)
|
||||
return C;
|
||||
// 17 characters
|
||||
if (C <= 0x0480)
|
||||
return C | 1;
|
||||
if (C < 0x048a)
|
||||
return C;
|
||||
// 27 characters
|
||||
if (C <= 0x04be)
|
||||
return C | 1;
|
||||
// CYRILLIC LETTER PALOCHKA
|
||||
if (C == 0x04c0)
|
||||
return 0x04cf;
|
||||
if (C < 0x04c1)
|
||||
return C;
|
||||
// 7 characters
|
||||
if (C <= 0x04cd && C % 2 == 1)
|
||||
return C + 1;
|
||||
if (C < 0x04d0)
|
||||
return C;
|
||||
// 48 characters
|
||||
if (C <= 0x052e)
|
||||
return C | 1;
|
||||
if (C < 0x0531)
|
||||
return C;
|
||||
// 38 characters
|
||||
if (C <= 0x0556)
|
||||
return C + 48;
|
||||
if (C < 0x10a0)
|
||||
return C;
|
||||
// 38 characters
|
||||
if (C <= 0x10c5)
|
||||
return C + 7264;
|
||||
if (C < 0x10c7)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x10cd && C % 6 == 5)
|
||||
return C + 7264;
|
||||
if (C < 0x13f8)
|
||||
return C;
|
||||
// 6 characters
|
||||
if (C <= 0x13fd)
|
||||
return C + -8;
|
||||
// CYRILLIC SMALL LETTER ROUNDED VE
|
||||
if (C == 0x1c80)
|
||||
return 0x0432;
|
||||
// CYRILLIC SMALL LETTER LONG-LEGGED DE
|
||||
if (C == 0x1c81)
|
||||
return 0x0434;
|
||||
// CYRILLIC SMALL LETTER NARROW O
|
||||
if (C == 0x1c82)
|
||||
return 0x043e;
|
||||
if (C < 0x1c83)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1c84)
|
||||
return C + -6210;
|
||||
// CYRILLIC SMALL LETTER THREE-LEGGED TE
|
||||
if (C == 0x1c85)
|
||||
return 0x0442;
|
||||
// CYRILLIC SMALL LETTER TALL HARD SIGN
|
||||
if (C == 0x1c86)
|
||||
return 0x044a;
|
||||
// CYRILLIC SMALL LETTER TALL YAT
|
||||
if (C == 0x1c87)
|
||||
return 0x0463;
|
||||
// CYRILLIC SMALL LETTER UNBLENDED UK
|
||||
if (C == 0x1c88)
|
||||
return 0xa64b;
|
||||
if (C < 0x1e00)
|
||||
return C;
|
||||
// 75 characters
|
||||
if (C <= 0x1e94)
|
||||
return C | 1;
|
||||
// LATIN SMALL LETTER LONG S WITH DOT ABOVE
|
||||
if (C == 0x1e9b)
|
||||
return 0x1e61;
|
||||
// LATIN CAPITAL LETTER SHARP S
|
||||
if (C == 0x1e9e)
|
||||
return 0x00df;
|
||||
if (C < 0x1ea0)
|
||||
return C;
|
||||
// 48 characters
|
||||
if (C <= 0x1efe)
|
||||
return C | 1;
|
||||
if (C < 0x1f08)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f0f)
|
||||
return C + -8;
|
||||
if (C < 0x1f18)
|
||||
return C;
|
||||
// 6 characters
|
||||
if (C <= 0x1f1d)
|
||||
return C + -8;
|
||||
if (C < 0x1f28)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f2f)
|
||||
return C + -8;
|
||||
if (C < 0x1f38)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f3f)
|
||||
return C + -8;
|
||||
if (C < 0x1f48)
|
||||
return C;
|
||||
// 6 characters
|
||||
if (C <= 0x1f4d)
|
||||
return C + -8;
|
||||
if (C < 0x1f59)
|
||||
return C;
|
||||
// 4 characters
|
||||
if (C <= 0x1f5f && C % 2 == 1)
|
||||
return C + -8;
|
||||
if (C < 0x1f68)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f6f)
|
||||
return C + -8;
|
||||
if (C < 0x1f88)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f8f)
|
||||
return C + -8;
|
||||
if (C < 0x1f98)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1f9f)
|
||||
return C + -8;
|
||||
if (C < 0x1fa8)
|
||||
return C;
|
||||
// 8 characters
|
||||
if (C <= 0x1faf)
|
||||
return C + -8;
|
||||
if (C < 0x1fb8)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1fb9)
|
||||
return C + -8;
|
||||
if (C < 0x1fba)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1fbb)
|
||||
return C + -74;
|
||||
// GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
|
||||
if (C == 0x1fbc)
|
||||
return 0x1fb3;
|
||||
// GREEK PROSGEGRAMMENI
|
||||
if (C == 0x1fbe)
|
||||
return 0x03b9;
|
||||
if (C < 0x1fc8)
|
||||
return C;
|
||||
// 4 characters
|
||||
if (C <= 0x1fcb)
|
||||
return C + -86;
|
||||
// GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
|
||||
if (C == 0x1fcc)
|
||||
return 0x1fc3;
|
||||
if (C < 0x1fd8)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1fd9)
|
||||
return C + -8;
|
||||
if (C < 0x1fda)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1fdb)
|
||||
return C + -100;
|
||||
if (C < 0x1fe8)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1fe9)
|
||||
return C + -8;
|
||||
if (C < 0x1fea)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1feb)
|
||||
return C + -112;
|
||||
// GREEK CAPITAL LETTER RHO WITH DASIA
|
||||
if (C == 0x1fec)
|
||||
return 0x1fe5;
|
||||
if (C < 0x1ff8)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1ff9)
|
||||
return C + -128;
|
||||
if (C < 0x1ffa)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x1ffb)
|
||||
return C + -126;
|
||||
// GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
|
||||
if (C == 0x1ffc)
|
||||
return 0x1ff3;
|
||||
// OHM SIGN
|
||||
if (C == 0x2126)
|
||||
return 0x03c9;
|
||||
// KELVIN SIGN
|
||||
if (C == 0x212a)
|
||||
return 0x006b;
|
||||
// ANGSTROM SIGN
|
||||
if (C == 0x212b)
|
||||
return 0x00e5;
|
||||
// TURNED CAPITAL F
|
||||
if (C == 0x2132)
|
||||
return 0x214e;
|
||||
if (C < 0x2160)
|
||||
return C;
|
||||
// 16 characters
|
||||
if (C <= 0x216f)
|
||||
return C + 16;
|
||||
// ROMAN NUMERAL REVERSED ONE HUNDRED
|
||||
if (C == 0x2183)
|
||||
return 0x2184;
|
||||
if (C < 0x24b6)
|
||||
return C;
|
||||
// 26 characters
|
||||
if (C <= 0x24cf)
|
||||
return C + 26;
|
||||
if (C < 0x2c00)
|
||||
return C;
|
||||
// 47 characters
|
||||
if (C <= 0x2c2e)
|
||||
return C + 48;
|
||||
// LATIN CAPITAL LETTER L WITH DOUBLE BAR
|
||||
if (C == 0x2c60)
|
||||
return 0x2c61;
|
||||
// LATIN CAPITAL LETTER L WITH MIDDLE TILDE
|
||||
if (C == 0x2c62)
|
||||
return 0x026b;
|
||||
// LATIN CAPITAL LETTER P WITH STROKE
|
||||
if (C == 0x2c63)
|
||||
return 0x1d7d;
|
||||
// LATIN CAPITAL LETTER R WITH TAIL
|
||||
if (C == 0x2c64)
|
||||
return 0x027d;
|
||||
if (C < 0x2c67)
|
||||
return C;
|
||||
// 3 characters
|
||||
if (C <= 0x2c6b && C % 2 == 1)
|
||||
return C + 1;
|
||||
// LATIN CAPITAL LETTER ALPHA
|
||||
if (C == 0x2c6d)
|
||||
return 0x0251;
|
||||
// LATIN CAPITAL LETTER M WITH HOOK
|
||||
if (C == 0x2c6e)
|
||||
return 0x0271;
|
||||
// LATIN CAPITAL LETTER TURNED A
|
||||
if (C == 0x2c6f)
|
||||
return 0x0250;
|
||||
// LATIN CAPITAL LETTER TURNED ALPHA
|
||||
if (C == 0x2c70)
|
||||
return 0x0252;
|
||||
if (C < 0x2c72)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x2c75 && C % 3 == 2)
|
||||
return C + 1;
|
||||
if (C < 0x2c7e)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x2c7f)
|
||||
return C + -10815;
|
||||
if (C < 0x2c80)
|
||||
return C;
|
||||
// 50 characters
|
||||
if (C <= 0x2ce2)
|
||||
return C | 1;
|
||||
if (C < 0x2ceb)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0x2ced && C % 2 == 1)
|
||||
return C + 1;
|
||||
if (C < 0x2cf2)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0xa640 && C % 31054 == 11506)
|
||||
return C + 1;
|
||||
if (C < 0xa642)
|
||||
return C;
|
||||
// 22 characters
|
||||
if (C <= 0xa66c)
|
||||
return C | 1;
|
||||
if (C < 0xa680)
|
||||
return C;
|
||||
// 14 characters
|
||||
if (C <= 0xa69a)
|
||||
return C | 1;
|
||||
if (C < 0xa722)
|
||||
return C;
|
||||
// 7 characters
|
||||
if (C <= 0xa72e)
|
||||
return C | 1;
|
||||
if (C < 0xa732)
|
||||
return C;
|
||||
// 31 characters
|
||||
if (C <= 0xa76e)
|
||||
return C | 1;
|
||||
if (C < 0xa779)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0xa77b && C % 2 == 1)
|
||||
return C + 1;
|
||||
// LATIN CAPITAL LETTER INSULAR G
|
||||
if (C == 0xa77d)
|
||||
return 0x1d79;
|
||||
if (C < 0xa77e)
|
||||
return C;
|
||||
// 5 characters
|
||||
if (C <= 0xa786)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER SALTILLO
|
||||
if (C == 0xa78b)
|
||||
return 0xa78c;
|
||||
// LATIN CAPITAL LETTER TURNED H
|
||||
if (C == 0xa78d)
|
||||
return 0x0265;
|
||||
if (C < 0xa790)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0xa792)
|
||||
return C | 1;
|
||||
if (C < 0xa796)
|
||||
return C;
|
||||
// 10 characters
|
||||
if (C <= 0xa7a8)
|
||||
return C | 1;
|
||||
// LATIN CAPITAL LETTER H WITH HOOK
|
||||
if (C == 0xa7aa)
|
||||
return 0x0266;
|
||||
// LATIN CAPITAL LETTER REVERSED OPEN E
|
||||
if (C == 0xa7ab)
|
||||
return 0x025c;
|
||||
// LATIN CAPITAL LETTER SCRIPT G
|
||||
if (C == 0xa7ac)
|
||||
return 0x0261;
|
||||
// LATIN CAPITAL LETTER L WITH BELT
|
||||
if (C == 0xa7ad)
|
||||
return 0x026c;
|
||||
// LATIN CAPITAL LETTER SMALL CAPITAL I
|
||||
if (C == 0xa7ae)
|
||||
return 0x026a;
|
||||
// LATIN CAPITAL LETTER TURNED K
|
||||
if (C == 0xa7b0)
|
||||
return 0x029e;
|
||||
// LATIN CAPITAL LETTER TURNED T
|
||||
if (C == 0xa7b1)
|
||||
return 0x0287;
|
||||
// LATIN CAPITAL LETTER J WITH CROSSED-TAIL
|
||||
if (C == 0xa7b2)
|
||||
return 0x029d;
|
||||
// LATIN CAPITAL LETTER CHI
|
||||
if (C == 0xa7b3)
|
||||
return 0xab53;
|
||||
if (C < 0xa7b4)
|
||||
return C;
|
||||
// 2 characters
|
||||
if (C <= 0xa7b6)
|
||||
return C | 1;
|
||||
if (C < 0xab70)
|
||||
return C;
|
||||
// 80 characters
|
||||
if (C <= 0xabbf)
|
||||
return C + -38864;
|
||||
if (C < 0xff21)
|
||||
return C;
|
||||
// 26 characters
|
||||
if (C <= 0xff3a)
|
||||
return C + 32;
|
||||
if (C < 0x10400)
|
||||
return C;
|
||||
// 40 characters
|
||||
if (C <= 0x10427)
|
||||
return C + 40;
|
||||
if (C < 0x104b0)
|
||||
return C;
|
||||
// 36 characters
|
||||
if (C <= 0x104d3)
|
||||
return C + 40;
|
||||
if (C < 0x10c80)
|
||||
return C;
|
||||
// 51 characters
|
||||
if (C <= 0x10cb2)
|
||||
return C + 64;
|
||||
if (C < 0x118a0)
|
||||
return C;
|
||||
// 32 characters
|
||||
if (C <= 0x118bf)
|
||||
return C + 32;
|
||||
if (C < 0x1e900)
|
||||
return C;
|
||||
// 34 characters
|
||||
if (C <= 0x1e921)
|
||||
return C + 34;
|
||||
|
||||
return C;
|
||||
}
|
|
@ -19,6 +19,7 @@ add_llvm_unittest(SupportTests
|
|||
ConvertUTFTest.cpp
|
||||
DataExtractorTest.cpp
|
||||
DebugTest.cpp
|
||||
DJBTest.cpp
|
||||
EndianStreamTest.cpp
|
||||
EndianTest.cpp
|
||||
ErrnoTest.cpp
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
//===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/DJB.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
TEST(DJBTest, caseFolding) {
|
||||
struct TestCase {
|
||||
StringLiteral One;
|
||||
StringLiteral Two;
|
||||
};
|
||||
|
||||
static constexpr TestCase Tests[] = {
|
||||
{"ASDF", "asdf"},
|
||||
{"qWeR", "QwEr"},
|
||||
{"qqqqqqqqqqqqqqqqqqqq", "QQQQQQQQQQQQQQQQQQQQ"},
|
||||
|
||||
{"I", "i"},
|
||||
// Latin Small Letter Dotless I
|
||||
{u8"\u0130", "i"},
|
||||
// Latin Capital Letter I With Dot Above
|
||||
{u8"\u0131", "i"},
|
||||
|
||||
// Latin Capital Letter A With Grave
|
||||
{u8"\u00c0", u8"\u00e0"},
|
||||
// Latin Capital Letter A With Macron
|
||||
{u8"\u0100", u8"\u0101"},
|
||||
// Latin Capital Letter L With Acute
|
||||
{u8"\u0139", u8"\u013a"},
|
||||
// Cyrillic Capital Letter Ie
|
||||
{u8"\u0415", u8"\u0435"},
|
||||
// Latin Capital Letter A With Circumflex And Grave
|
||||
{u8"\u1ea6", u8"\u1ea7"},
|
||||
// Kelvin Sign
|
||||
{u8"\u212a", u8"\u006b"},
|
||||
// Glagolitic Capital Letter Chrivi
|
||||
{u8"\u2c1d", u8"\u2c4d"},
|
||||
// Fullwidth Latin Capital Letter M
|
||||
{u8"\uff2d", u8"\uff4d"},
|
||||
// Old Hungarian Capital Letter Ej
|
||||
{u8"\U00010c92", u8"\U00010cd2"},
|
||||
};
|
||||
|
||||
for (const TestCase &T : Tests) {
|
||||
SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'");
|
||||
EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(DJBTest, knownValuesLowerCase) {
|
||||
struct TestCase {
|
||||
StringLiteral Text;
|
||||
uint32_t Hash;
|
||||
};
|
||||
static constexpr TestCase Tests[] = {
|
||||
{"", 5381u},
|
||||
{"f", 177675u},
|
||||
{"fo", 5863386u},
|
||||
{"foo", 193491849u},
|
||||
{"foob", 2090263819u},
|
||||
{"fooba", 259229388u},
|
||||
{"foobar", 4259602622u},
|
||||
{"pneumonoultramicroscopicsilicovolcanoconiosis", 3999417781u},
|
||||
};
|
||||
|
||||
for (const TestCase &T : Tests) {
|
||||
SCOPED_TRACE("Text: '" + T.Text + "'");
|
||||
EXPECT_EQ(T.Hash, djbHash(T.Text));
|
||||
EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text));
|
||||
EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper()));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(DJBTest, knownValuesUnicode) {
|
||||
EXPECT_EQ(
|
||||
2326183139u,
|
||||
caseFoldingDjbHash(
|
||||
u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
|
||||
u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
|
||||
}
|
|
@ -0,0 +1,137 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Unicode case folding database conversion utility
|
||||
|
||||
Parses the database and generates a C++ function which implements the case
|
||||
folding algorithm. The database entries are of the form:
|
||||
|
||||
<code>; <status>; <mapping>; # <name>
|
||||
|
||||
<status> can be one of four characters:
|
||||
C - Common mappings
|
||||
S - mappings for Simple case folding
|
||||
F - mappings for Full case folding
|
||||
T - special case for Turkish I characters
|
||||
|
||||
Right now this generates a function which implements simple case folding (C+S
|
||||
entries).
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
# This variable will body of the mappings function
|
||||
body = ""
|
||||
|
||||
# Reads file line-by-line, extracts Common and Simple case fold mappings and
|
||||
# returns a (from_char, to_char, from_name) tuple.
|
||||
def mappings(f):
|
||||
previous_from = -1
|
||||
expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
|
||||
for line in f:
|
||||
m = expr.match(line)
|
||||
if not m: continue
|
||||
from_char = int(m.group(1), 16)
|
||||
to_char = int(m.group(2), 16)
|
||||
from_name = m.group(3)
|
||||
|
||||
if from_char <= previous_from:
|
||||
raise Exception("Duplicate or unsorted characters in input")
|
||||
yield from_char, to_char, from_name
|
||||
previous_from = from_char
|
||||
|
||||
# Computes the shift (to_char - from_char) in a mapping.
|
||||
def shift(mapping):
|
||||
return mapping[1] - mapping[0]
|
||||
|
||||
# Computes the stride (from_char2 - from_char1) of two mappings.
|
||||
def stride2(mapping1, mapping2):
|
||||
return mapping2[0] - mapping1[0]
|
||||
|
||||
# Computes the stride of a list of mappings. The list should have at least two
|
||||
# mappings. All mappings in the list are assumed to have the same stride.
|
||||
def stride(block):
|
||||
return stride2(block[0], block[1])
|
||||
|
||||
|
||||
# b is a list of mappings. All the mappings are assumed to have the same
|
||||
# shift and the stride between adjecant mappings (if any) is constant.
|
||||
def dump_block(b):
|
||||
global body
|
||||
|
||||
if len(b) == 1:
|
||||
# Special case for handling blocks of length 1. We don't even need to
|
||||
# emit the "if (C < X) return C" check below as all characters in this
|
||||
# range will be caught by the "C < X" check emitted by the first
|
||||
# non-trivial block.
|
||||
body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
|
||||
return
|
||||
|
||||
first = b[0][0]
|
||||
last = first + stride(b) * (len(b)-1)
|
||||
modulo = first % stride(b)
|
||||
|
||||
# All characters before this block map to themselves.
|
||||
body += " if (C < {0:#06x})\n return C;\n".format(first)
|
||||
body += " // {0} characters\n".format(len(b))
|
||||
|
||||
# Generic pattern: check upper bound (lower bound is checked by the "if"
|
||||
# above) and modulo of C, return C+shift.
|
||||
pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"
|
||||
|
||||
if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
|
||||
# Special case:
|
||||
# We can elide the modulo-check because the expression "C|1" will map
|
||||
# the intervening characters to themselves.
|
||||
pattern = " if (C <= {0:#06x})\n return C | 1;\n"
|
||||
elif stride(b) == 1:
|
||||
# Another special case: X % 1 is always zero, so don't emit the
|
||||
# modulo-check.
|
||||
pattern = " if (C <= {0:#06x})\n return C + {3};\n"
|
||||
|
||||
body += pattern.format(last, stride(b), modulo, shift(b[0]))
|
||||
|
||||
current_block = []
|
||||
f = urllib2.urlopen(sys.argv[1])
|
||||
for m in mappings(f):
|
||||
if len(current_block) == 0:
|
||||
current_block.append(m)
|
||||
continue
|
||||
|
||||
if shift(current_block[0]) != shift(m):
|
||||
# Incompatible shift, start a new block.
|
||||
dump_block(current_block)
|
||||
current_block = [m]
|
||||
continue
|
||||
|
||||
if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
|
||||
current_block.append(m)
|
||||
continue
|
||||
|
||||
# Incompatible stride, start a new block.
|
||||
dump_block(current_block)
|
||||
current_block = [m]
|
||||
f.close()
|
||||
|
||||
dump_block(current_block)
|
||||
|
||||
print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//'
|
||||
print '//'
|
||||
print '// This file was generated by utils/unicode-case-fold.py from the Unicode'
|
||||
print '// case folding database at'
|
||||
print '// ', sys.argv[1]
|
||||
print '//'
|
||||
print '// To regenerate this file, run:'
|
||||
print '// utils/unicode-case-fold.py \\'
|
||||
print '// "{}" \\'.format(sys.argv[1])
|
||||
print '// > lib/Support/UnicodeCaseFold.cpp'
|
||||
print '//'
|
||||
print '//===----------------------------------------------------------------------===//'
|
||||
print ''
|
||||
print '#include "llvm/Support/Unicode.h"'
|
||||
print ''
|
||||
print "int llvm::sys::unicode::foldCharSimple(int C) {"
|
||||
print body
|
||||
print " return C;"
|
||||
print "}"
|
Loading…
Reference in New Issue