llvm-project/llvm/lib/Support/UnicodeNameToCodepoint.cpp

552 lines
17 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
//-*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions to map the name or alias of a unicode
// character to its codepoint.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Unicode.h"
namespace llvm {
namespace sys {
namespace unicode {
extern const char *UnicodeNameToCodepointDict;
extern const uint8_t *UnicodeNameToCodepointIndex;
extern const std::size_t UnicodeNameToCodepointIndexSize;
extern const std::size_t UnicodeNameToCodepointLargestNameSize;
using BufferType = SmallString<64>;
struct Node {
bool IsRoot = false;
char32_t Value = 0xFFFFFFFF;
uint32_t ChildrenOffset = 0;
bool HasSibling = false;
uint32_t Size = 0;
StringRef Name;
const Node *Parent = nullptr;
constexpr bool isValid() const {
return !Name.empty() || Value == 0xFFFFFFFF;
}
constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
std::string fullName() const {
std::string S;
// Reserve enough space for most unicode code points.
// The chosen value represent the 99th percentile of name size as of
// Unicode 14.
S.reserve(46);
const Node *N = this;
while (N) {
std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
N = N->Parent;
}
std::reverse(S.begin(), S.end());
return S;
}
};
static Node createRoot() {
Node N;
N.IsRoot = true;
N.ChildrenOffset = 1;
N.Size = 1;
return N;
}
static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
if (Offset == 0)
return createRoot();
uint32_t Origin = Offset;
Node N;
N.Parent = Parent;
uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
return N;
bool LongName = NameInfo & 0x40;
bool HasValue = NameInfo & 0x80;
std::size_t Size = NameInfo & ~0xC0;
if (LongName) {
uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
NameOffset |= UnicodeNameToCodepointIndex[Offset++];
N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
} else {
N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
}
if (HasValue) {
uint8_t H = UnicodeNameToCodepointIndex[Offset++];
uint8_t M = UnicodeNameToCodepointIndex[Offset++];
uint8_t L = UnicodeNameToCodepointIndex[Offset++];
N.Value = ((H << 16) | (M << 8) | L) >> 3;
bool HasChildren = L & 0x02;
N.HasSibling = L & 0x01;
if (HasChildren) {
N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
}
} else {
uint8_t H = UnicodeNameToCodepointIndex[Offset++];
N.HasSibling = H & 0x80;
bool HasChildren = H & 0x40;
H &= ~0xC0;
if (HasChildren) {
N.ChildrenOffset = (H << 16);
N.ChildrenOffset |=
(uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
}
}
N.Size = Offset - Origin;
return N;
}
static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
std::size_t &Consummed, char &PreviousCharInName,
char &PreviousCharInNeedle, bool IsPrefix = false) {
Consummed = 0;
if (Strict) {
if (!Name.startswith(Needle))
return false;
Consummed = Needle.size();
return true;
}
if (Needle.empty())
return true;
auto NamePos = Name.begin();
auto NeedlePos = Needle.begin();
char PreviousCharInNameOrigin = PreviousCharInName;
char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
bool IgnoreEnd = false) {
while (It != End) {
const auto Next = std::next(It);
// Ignore spaces, underscore, medial hyphens
// https://unicode.org/reports/tr44/#UAX44-LM2.
bool Ignore =
*It == ' ' || *It == '_' ||
(*It == '-' && isAlnum(PreviousChar) &&
((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
PreviousChar = *It;
if (!Ignore)
break;
++It;
}
return It;
};
while (true) {
NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
NeedlePos =
IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
if (NeedlePos == Needle.end())
break;
if (NamePos == Name.end())
break;
if (toUpper(*NeedlePos) != toUpper(*NamePos))
break;
NeedlePos++;
NamePos++;
}
Consummed = std::distance(Name.begin(), NamePos);
if (NeedlePos != Needle.end()) {
PreviousCharInName = PreviousCharInNameOrigin;
PreviousCharInNeedle = PreviousCharInNeedleOrigin;
}
return NeedlePos == Needle.end();
}
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict,
char PreviousCharInName, char PreviousCharInNeedle,
BufferType &Buffer, const Node *Parent = nullptr) {
Node N = readNode(Offset, Parent);
std::size_t Consummed = 0;
bool DoesStartWith =
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName, PreviousCharInNeedle);
if (!DoesStartWith)
return std::make_tuple(N, false, 0);
if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
return std::make_tuple(N, true, N.Value);
if (N.hasChildren()) {
uint32_t ChildOffset = N.ChildrenOffset;
for (;;) {
Node C;
bool Matches;
uint32_t Value;
std::tie(C, Matches, Value) =
compareNode(ChildOffset, Name.substr(Consummed), Strict,
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
if (Matches) {
std::reverse_copy(C.Name.begin(), C.Name.end(),
std::back_inserter(Buffer));
return std::make_tuple(N, true, Value);
}
ChildOffset += C.Size;
if (!C.HasSibling)
break;
}
}
return std::make_tuple(N, false, 0);
}
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
return compareNode(Offset, Name, Strict, 0, 0, Buffer);
}
// clang-format off
constexpr const char *const HangulSyllables[][3] = {
{ "G", "A", "" },
{ "GG", "AE", "G" },
{ "N", "YA", "GG" },
{ "D", "YAE", "GS" },
{ "DD", "EO", "N", },
{ "R", "E", "NJ" },
{ "M", "YEO", "NH" },
{ "B", "YE", "D" },
{ "BB", "O", "L" },
{ "S", "WA", "LG" },
{ "SS", "WAE", "LM" },
{ "", "OE", "LB" },
{ "J", "YO", "LS" },
{ "JJ", "U", "LT" },
{ "C", "WEO", "LP" },
{ "K", "WE", "LH" },
{ "T", "WI", "M" },
{ "P", "YU", "B" },
{ "H", "EU", "BS" },
{ 0, "YI", "S" },
{ 0, "I", "SS" },
{ 0, 0, "NG" },
{ 0, 0, "J" },
{ 0, 0, "C" },
{ 0, 0, "K" },
{ 0, 0, "T" },
{ 0, 0, "P" },
{ 0, 0, "H" }
};
// clang-format on
// Unicode 14.0
// 3.12 Conjoining Jamo Behavior Common constants
constexpr const char32_t SBase = 0xAC00;
constexpr const uint32_t LCount = 19;
constexpr const uint32_t VCount = 21;
constexpr const uint32_t TCount = 28;
static std::size_t findSyllable(StringRef Name, bool Strict,
char &PreviousInName, int &Pos, int Column) {
assert(Column == 0 || Column == 1 || Column == 2);
static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
char NeedleStart = 0;
int Len = -1;
int Prev = PreviousInName;
for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
StringRef Syllable(HangulSyllables[I][Column]);
if (int(Syllable.size()) <= Len)
continue;
std::size_t Consummed = 0;
char PreviousInNameCopy = PreviousInName;
bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
PreviousInNameCopy, NeedleStart);
if (!DoesStartWith)
continue;
Len = Consummed;
Pos = I;
Prev = PreviousInNameCopy;
}
if (Len == -1)
return 0;
PreviousInName = Prev;
return size_t(Len);
}
static llvm::Optional<char32_t>
nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
Buffer.clear();
// Hangul Syllable Decomposition
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
NameStart, NeedleStart);
if (!DoesStartWith)
return None;
Name = Name.substr(Consummed);
int L = -1, V = -1, T = -1;
Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
if (L != -1 && V != -1 && T != -1 && Name.empty()) {
if (!Strict) {
Buffer.append("HANGUL SYLLABLE ");
if (L != -1)
Buffer.append(HangulSyllables[L][0]);
if (V != -1)
Buffer.append(HangulSyllables[V][1]);
if (T != -1)
Buffer.append(HangulSyllables[T][2]);
}
return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
std::uint32_t(T);
}
// Otherwise, it's an illegal syllable name.
return None;
}
struct GeneratedNamesData {
StringRef Prefix;
uint32_t Start;
uint32_t End;
};
// Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
// This needs to be kept in sync with
// llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
static const GeneratedNamesData GeneratedNamesDataTable[] = {
{"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
{"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
{"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
{"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
{"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
{"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
{"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
{"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
{"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
{"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
{"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
{"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
{"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
{"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
{"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
};
static llvm::Optional<char32_t>
nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
for (auto &&Item : GeneratedNamesDataTable) {
Buffer.clear();
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
NameStart, NeedleStart, /*isPrefix*/ true);
if (!DoesStartWith)
continue;
auto Number = Name.substr(Consummed);
unsigned long long V = 0;
// Be consistent about mandating upper casing.
if (Strict &&
llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
return {};
if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
continue;
if (!Strict) {
Buffer.append(Item.Prefix);
Buffer.append(utohexstr(V, true));
}
return V;
}
return None;
}
static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
BufferType &Buffer) {
if (Name.empty())
return None;
llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
if (!Res)
Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
if (Res)
return *Res;
Buffer.clear();
Node Node;
bool Matches;
uint32_t Value;
std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
if (Matches) {
std::reverse(Buffer.begin(), Buffer.end());
// UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
// hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
if (!Strict && Value == 0x116c &&
Name.find_insensitive("O-E") != StringRef::npos) {
Buffer = "HANGUL JUNGSEONG O-E";
Value = 0x1180;
}
return Value;
}
return None;
}
llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
BufferType Buffer;
auto Opt = nameToCodepoint(Name, true, Buffer);
return Opt;
}
llvm::Optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name) {
BufferType Buffer;
auto Opt = nameToCodepoint(Name, false, Buffer);
if (!Opt)
return None;
return LooseMatchingResult{*Opt, Buffer};
}
// Find the unicode character whose editing distance to Pattern
// is shortest, using the WagnerFischer algorithm.
llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
// We maintain a fixed size vector of matches,
// sorted by distance
// The worst match (with the biggest distance) are discarded when new elements
// are added.
std::size_t LargestEditDistance = 0;
llvm::SmallVector<MatchForCodepointName> Matches;
Matches.reserve(MaxMatchesCount + 1);
auto Insert = [&](const Node &Node, uint32_t Distance,
char32_t Value) -> bool {
if (Distance > LargestEditDistance) {
if (Matches.size() == MaxMatchesCount)
return false;
LargestEditDistance = Distance;
}
// To avoid allocations, the creation of the name is delayed
// as much as possible.
std::string Name;
auto GetName = [&] {
if (Name.empty())
Name = Node.fullName();
return Name;
};
auto It = std::lower_bound(
Matches.begin(), Matches.end(), Distance,
[&](const MatchForCodepointName &a, std::size_t Distance) {
if (Distance == a.Distance)
return a.Name < GetName();
return a.Distance < Distance;
});
if (It == Matches.end() && Matches.size() == MaxMatchesCount)
return false;
MatchForCodepointName M{GetName(), Distance, Value};
Matches.insert(It, std::move(M));
if (Matches.size() > MaxMatchesCount)
Matches.pop_back();
return true;
};
// We ignore case, space, hyphens, etc,
// in both the search pattern and the prospective names.
auto Normalize = [](StringRef Name) {
std::string Out;
Out.reserve(Name.size());
for (char C : Name) {
if (isAlnum(C))
Out.push_back(toUpper(C));
}
return Out;
};
std::string NormalizedName = Normalize(Pattern);
// Allocate a matrix big enough for longest names.
const std::size_t Columns =
std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
1;
LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
UnicodeNameToCodepointLargestNameSize + 1;
std::vector<char> Distances(
Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
assert(Column < Columns);
assert(Row < Rows);
return Distances[Row * Columns + Column];
};
for (std::size_t I = 0; I < Columns; I++)
Get(I, 0) = I;
// Visit the childrens,
// Filling (and overriding) the matrix for the name fragment of each node
// iteratively. CompleteName is used to collect the actual name of potential
// match, respecting case and spacing.
auto VisitNode = [&](const Node &N, std::size_t Row,
auto &VisitNode) -> void {
std::size_t J = 0;
for (; J < N.Name.size(); J++) {
if (!isAlnum(N.Name[J]))
continue;
Get(0, Row) = Row;
for (std::size_t I = 1; I < Columns; I++) {
const int Delete = Get(I - 1, Row) + 1;
const int Insert = Get(I, Row - 1) + 1;
const int Replace =
Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
}
Row++;
}
unsigned Cost = Get(Columns - 1, Row - 1);
if (N.Value != 0xFFFFFFFF) {
Insert(N, Cost, N.Value);
}
if (N.hasChildren()) {
auto ChildOffset = N.ChildrenOffset;
for (;;) {
Node C = readNode(ChildOffset, &N);
ChildOffset += C.Size;
if (!C.isValid())
break;
VisitNode(C, Row, VisitNode);
if (!C.HasSibling)
break;
}
}
};
Node Root = createRoot();
VisitNode(Root, 1, VisitNode);
return Matches;
}
} // namespace unicode
} // namespace sys
} // namespace llvm