Comment parsing: resolve more named character references

This reimplements r173850 with a better approach:
(1) use a TableGen-generated matcher instead of doing a linear search;
(2) avoid allocations for new strings by converting code points to string
    literals with TableGen.

llvm-svn: 173931
This commit is contained in:
Dmitri Gribenko 2013-01-30 14:29:28 +00:00
parent 32832e6176
commit 28800da1b3
10 changed files with 306 additions and 190 deletions

View File

@ -33,6 +33,10 @@ clang_tablegen(CommentHTMLTagsProperties.inc -gen-clang-comment-html-tags-proper
SOURCE CommentHTMLTags.td
TARGET ClangCommentHTMLTagsProperties)
clang_tablegen(CommentHTMLNamedCharacterReferences.inc -gen-clang-comment-html-named-character-references
SOURCE CommentHTMLNamedCharacterReferences.td
TARGET ClangCommentHTMLNamedCharacterReferences)
clang_tablegen(CommentCommandInfo.inc -gen-clang-comment-command-info
SOURCE CommentCommands.td
TARGET ClangCommentCommandInfo)

View File

@ -0,0 +1,177 @@
// HTML named character reference (NCR).
//
// Each record pairs the reference name as it appears between '&' and ';'
// (for example "copy" for "&copy;") with the Unicode code point it stands for.
// The template arguments are positional: spelling first, code point second.
class NCR<string name, int value> {
  // Case-sensitive name of the reference, without the '&' and ';' delimiters.
  string Spelling = name;

  // Unicode code point that the reference resolves to.
  int CodePoint = value;
}
// The list below includes named character references supported by Doxygen:
// http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html
//
// It does not include all HTML 5 named character references.
//
// Corresponding code point values can be found here:
// http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
def : NCR<"copy", 0x000A9>;
def : NCR<"COPY", 0x000A9>;
def : NCR<"trade", 0x02122>;
def : NCR<"TRADE", 0x02122>;
def : NCR<"reg", 0x000AE>;
def : NCR<"REG", 0x000AE>;
// Only the all-lowercase and all-uppercase spellings name the ASCII angle
// brackets.  In the HTML5 table the mixed-case spellings are distinct
// characters: "&Lt;" is U+226A MUCH LESS-THAN and "&Gt;" is U+226B MUCH
// GREATER-THAN.
def : NCR<"lt", 0x0003C>;
def : NCR<"Lt", 0x0226A>;
def : NCR<"LT", 0x0003C>;
def : NCR<"gt", 0x0003E>;
def : NCR<"Gt", 0x0226B>;
def : NCR<"GT", 0x0003E>;
def : NCR<"amp", 0x00026>;
def : NCR<"AMP", 0x00026>;
def : NCR<"apos", 0x00027>;
def : NCR<"quot", 0x00022>;
def : NCR<"QUOT", 0x00022>;
def : NCR<"lsquo", 0x02018>;
def : NCR<"rsquo", 0x02019>;
def : NCR<"ldquo", 0x0201C>;
def : NCR<"rdquo", 0x0201D>;
def : NCR<"ndash", 0x02013>;
def : NCR<"mdash", 0x02014>;
def : NCR<"Auml", 0x000C4>;
def : NCR<"Euml", 0x000CB>;
def : NCR<"Iuml", 0x000CF>;
def : NCR<"Ouml", 0x000D6>;
def : NCR<"Uuml", 0x000DC>;
def : NCR<"Yuml", 0x00178>;
def : NCR<"auml", 0x000E4>;
def : NCR<"euml", 0x000EB>;
def : NCR<"iuml", 0x000EF>;
def : NCR<"ouml", 0x000F6>;
def : NCR<"uuml", 0x000FC>;
def : NCR<"yuml", 0x000FF>;
def : NCR<"Aacute", 0x000C1>;
def : NCR<"Eacute", 0x000C9>;
def : NCR<"Iacute", 0x000CD>;
def : NCR<"Oacute", 0x000D3>;
def : NCR<"Uacute", 0x000DA>;
def : NCR<"Yacute", 0x000DD>;
def : NCR<"aacute", 0x000E1>;
def : NCR<"eacute", 0x000E9>;
def : NCR<"iacute", 0x000ED>;
def : NCR<"oacute", 0x000F3>;
def : NCR<"uacute", 0x000FA>;
def : NCR<"yacute", 0x000FD>;
def : NCR<"Agrave", 0x000C0>;
def : NCR<"Egrave", 0x000C8>;
def : NCR<"Igrave", 0x000CC>;
def : NCR<"Ograve", 0x000D2>;
def : NCR<"Ugrave", 0x000D9>;
// def : NCR<"Ygrave", 0x01EF2>; // Defined neither in Doxygen, nor in HTML5.
def : NCR<"agrave", 0x000E0>;
def : NCR<"egrave", 0x000E8>;
def : NCR<"igrave", 0x000EC>;
def : NCR<"ograve", 0x000F2>;
def : NCR<"ugrave", 0x000F9>;
def : NCR<"ygrave", 0x01EF3>; // Defined in Doxygen, not defined in HTML5.
def : NCR<"Acirc", 0x000C2>;
def : NCR<"Ecirc", 0x000CA>;
def : NCR<"Icirc", 0x000CE>;
def : NCR<"Ocirc", 0x000D4>;
def : NCR<"Ucirc", 0x000DB>;
def : NCR<"Ycirc", 0x00176>; // Not defined in Doxygen, defined in HTML5.
def : NCR<"acirc", 0x000E2>;
def : NCR<"ecirc", 0x000EA>;
def : NCR<"icirc", 0x000EE>;
def : NCR<"ocirc", 0x000F4>;
def : NCR<"ucirc", 0x000FB>;
def : NCR<"ycirc", 0x00177>;
def : NCR<"Atilde", 0x000C3>;
def : NCR<"Ntilde", 0x000D1>;
def : NCR<"Otilde", 0x000D5>;
def : NCR<"atilde", 0x000E3>;
def : NCR<"ntilde", 0x000F1>;
def : NCR<"otilde", 0x000F5>;
def : NCR<"szlig", 0x000DF>;
def : NCR<"ccedil", 0x000E7>;
def : NCR<"Ccedil", 0x000C7>;
def : NCR<"aring", 0x000E5>;
def : NCR<"Aring", 0x000C5>;
def : NCR<"nbsp", 0x000A0>;
def : NCR<"Gamma", 0x00393>;
def : NCR<"Delta", 0x00394>;
def : NCR<"Theta", 0x00398>;
def : NCR<"Lambda", 0x0039B>;
def : NCR<"Xi", 0x0039E>;
def : NCR<"Pi", 0x003A0>;
def : NCR<"Sigma", 0x003A3>;
def : NCR<"Upsilon", 0x003A5>;
def : NCR<"Phi", 0x003A6>;
def : NCR<"Psi", 0x003A8>;
def : NCR<"Omega", 0x003A9>;
def : NCR<"alpha", 0x003B1>;
def : NCR<"beta", 0x003B2>;
def : NCR<"gamma", 0x003B3>;
def : NCR<"delta", 0x003B4>;
def : NCR<"epsilon", 0x003B5>;
def : NCR<"zeta", 0x003B6>;
def : NCR<"eta", 0x003B7>;
def : NCR<"theta", 0x003B8>;
def : NCR<"iota", 0x003B9>;
def : NCR<"kappa", 0x003BA>;
def : NCR<"lambda", 0x003BB>;
def : NCR<"mu", 0x003BC>;
def : NCR<"nu", 0x003BD>;
def : NCR<"xi", 0x003BE>;
def : NCR<"pi", 0x003C0>;
def : NCR<"rho", 0x003C1>;
def : NCR<"sigma", 0x003C3>;
def : NCR<"tau", 0x003C4>;
def : NCR<"upsilon", 0x003C5>;
def : NCR<"phi", 0x003C6>;
def : NCR<"chi", 0x003C7>;
def : NCR<"psi", 0x003C8>;
def : NCR<"omega", 0x003C9>;
def : NCR<"sigmaf", 0x003C2>;
def : NCR<"sect", 0x000A7>;
def : NCR<"deg", 0x000B0>;
def : NCR<"prime", 0x02032>;
def : NCR<"Prime", 0x02033>;
def : NCR<"infin", 0x0221E>;
def : NCR<"empty", 0x02205>;
def : NCR<"plusmn", 0x000B1>;
def : NCR<"times", 0x000D7>;
def : NCR<"minus", 0x02212>;
def : NCR<"sdot", 0x022C5>;
def : NCR<"part", 0x02202>;
def : NCR<"nabla", 0x02207>;
def : NCR<"radic", 0x0221A>;
def : NCR<"perp", 0x022A5>;
def : NCR<"sum", 0x02211>;
def : NCR<"int", 0x0222B>;
def : NCR<"prod", 0x0220F>;
def : NCR<"sim", 0x0223C>;
def : NCR<"asymp", 0x02248>;
def : NCR<"ne", 0x02260>;
def : NCR<"equiv", 0x02261>;
def : NCR<"prop", 0x0221D>;
def : NCR<"le", 0x02264>;
def : NCR<"ge", 0x02265>;
def : NCR<"larr", 0x02190>;
def : NCR<"rarr", 0x02192>;
def : NCR<"isin", 0x02208>;
def : NCR<"notin", 0x02209>;
def : NCR<"lceil", 0x02308>;
def : NCR<"rceil", 0x02309>;
def : NCR<"lfloor", 0x0230A>;
def : NCR<"rfloor", 0x0230B>;

View File

@ -282,18 +282,11 @@ private:
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
/// Given a Doxygen-supported named character reference (e.g., "&trade;"),
/// it returns its UTF8 encoding.
StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-10 integer, return the character.
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-16 integer, return the character.
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
/// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {

View File

@ -3,7 +3,9 @@ TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
BUILT_SOURCES = Attrs.inc AttrImpl.inc AttrDump.inc \
StmtNodes.inc DeclNodes.inc \
CommentNodes.inc CommentHTMLTags.inc \
CommentHTMLTagsProperties.inc CommentCommandInfo.inc
CommentHTMLTagsProperties.inc \
CommentHTMLNamedCharacterReferences.inc \
CommentCommandInfo.inc
TABLEGEN_INC_FILES_COMMON = 1
@ -52,6 +54,12 @@ $(ObjDir)/CommentHTMLTagsProperties.inc.tmp : $(PROJ_SRC_DIR)/CommentHTMLTags.td
$(Echo) "Building Clang comment HTML tag properties with tblgen"
$(Verb) $(ClangTableGen) -gen-clang-comment-html-tags-properties -o $(call SYSPATH, $@) $<
$(ObjDir)/CommentHTMLNamedCharacterReferences.inc.tmp : \
$(PROJ_SRC_DIR)/CommentHTMLNamedCharacterReferences.td \
$(CLANG_TBLGEN) $(ObjDir)/.dir
$(Echo) "Building Clang named character reference translation function with tblgen"
$(Verb) $(ClangTableGen) -gen-clang-comment-html-named-character-references -o $(call SYSPATH, $@) $<
$(ObjDir)/CommentCommandInfo.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \
$(CLANG_TBLGEN) $(ObjDir)/.dir
$(Echo) "Building Clang comment command info with tblgen"

View File

@ -68,6 +68,7 @@ add_dependencies(clangAST
ClangCommentNodes
ClangCommentHTMLTags
ClangCommentHTMLTagsProperties
ClangCommentHTMLNamedCharacterReferences
ClangDeclNodes
ClangDiagnosticAST
ClangDiagnosticComment

View File

@ -30,22 +30,8 @@ bool isHTMLHexCharacterReferenceCharacter(char C) {
(C >= 'A' && C <= 'F');
}
#include "clang/AST/CommentHTMLTags.inc"
} // unnamed namespace
static unsigned getCodePoint(StringRef Name) {
unsigned CodePoint = 0;
for (unsigned i = 0, e = Name.size(); i != e; ++i) {
CodePoint *= 16;
const char C = Name[i];
assert(isHTMLHexCharacterReferenceCharacter(C));
CodePoint += llvm::hexDigitValue(C);
}
return CodePoint;
}
StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator,
unsigned CodePoint) {
char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
char *ResolvedPtr = Resolved;
if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
@ -53,164 +39,22 @@ StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) cons
else
return StringRef();
}
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
unsigned CodePoint = getCodePoint(Name);
return helperResolveHTMLHexCharacterReference(CodePoint);
}
#include "clang/AST/CommentHTMLTags.inc"
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
} // unnamed namespace
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
// Fast path, first check a few most widely used named character references.
return llvm::StringSwitch<StringRef>(Name)
.Case("amp", "&")
.Case("lt", "<")
.Case("gt", ">")
.Case("quot", "\"")
.Case("apos", "\'")
.Default("");
}
StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
return llvm::StringSwitch<StringRef>(Name)
.Case("copy", helperResolveHTMLHexCharacterReference(0x000A9))
.Case("trade", helperResolveHTMLHexCharacterReference(0x02122))
.Case("reg", helperResolveHTMLHexCharacterReference(0x000AE))
.Case("lt", helperResolveHTMLHexCharacterReference(0x0003C))
.Case("gt", helperResolveHTMLHexCharacterReference(0x0003C))
.Case("amp", helperResolveHTMLHexCharacterReference(0x00026))
.Case("apos", helperResolveHTMLHexCharacterReference(0x00027))
.Case("quot", helperResolveHTMLHexCharacterReference(0x00022))
.Case("lsquo", helperResolveHTMLHexCharacterReference(0x02018))
.Case("rsquo", helperResolveHTMLHexCharacterReference(0x02019))
.Case("ldquo", helperResolveHTMLHexCharacterReference(0x0201C))
.Case("rdquo", helperResolveHTMLHexCharacterReference(0x0201D))
.Case("ndash", helperResolveHTMLHexCharacterReference(0x02013))
.Case("mdash", helperResolveHTMLHexCharacterReference(0x02014))
.Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4))
.Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB))
.Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF))
.Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6))
.Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC))
.Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178))
.Case("auml", helperResolveHTMLHexCharacterReference(0x000E4))
.Case("euml", helperResolveHTMLHexCharacterReference(0x000EB))
.Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF))
.Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6))
.Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC))
.Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF))
.Case("Aacute", helperResolveHTMLHexCharacterReference(0x000C1))
.Case("Eacute", helperResolveHTMLHexCharacterReference(0x000C9))
.Case("Iacute", helperResolveHTMLHexCharacterReference(0x000CD))
.Case("Oacute", helperResolveHTMLHexCharacterReference(0x000D3))
.Case("Uacute", helperResolveHTMLHexCharacterReference(0x000DA))
.Case("Yacute", helperResolveHTMLHexCharacterReference(0x000DD))
.Case("aacute", helperResolveHTMLHexCharacterReference(0x000E1))
.Case("eacute", helperResolveHTMLHexCharacterReference(0x000E9))
.Case("iacute", helperResolveHTMLHexCharacterReference(0x000ED))
.Case("oacute", helperResolveHTMLHexCharacterReference(0x000F3))
.Case("uacute", helperResolveHTMLHexCharacterReference(0x000FA))
.Case("yacute", helperResolveHTMLHexCharacterReference(0x000FD))
.Case("Agrave", helperResolveHTMLHexCharacterReference(0x000C0))
.Case("Egrave", helperResolveHTMLHexCharacterReference(0x000C8))
.Case("Igrave", helperResolveHTMLHexCharacterReference(0x000CC))
.Case("Ograve", helperResolveHTMLHexCharacterReference(0x000D2))
.Case("Ugrave", helperResolveHTMLHexCharacterReference(0x000D9))
.Case("agrave", helperResolveHTMLHexCharacterReference(0x000E0))
.Case("egrave", helperResolveHTMLHexCharacterReference(0x000E8))
.Case("igrave", helperResolveHTMLHexCharacterReference(0x000EC))
.Case("ograve", helperResolveHTMLHexCharacterReference(0x000F2))
.Case("ugrave", helperResolveHTMLHexCharacterReference(0x000F9))
.Case("ygrave", helperResolveHTMLHexCharacterReference(0x01EF3))
.Case("Acirc", helperResolveHTMLHexCharacterReference(0x000C2))
.Case("Ecirc", helperResolveHTMLHexCharacterReference(0x000CA))
.Case("Icirc", helperResolveHTMLHexCharacterReference(0x000CE))
.Case("Ocirc", helperResolveHTMLHexCharacterReference(0x000D4))
.Case("Ucirc", helperResolveHTMLHexCharacterReference(0x000DB))
.Case("acirc", helperResolveHTMLHexCharacterReference(0x000E2))
.Case("ecirc", helperResolveHTMLHexCharacterReference(0x000EA))
.Case("icirc", helperResolveHTMLHexCharacterReference(0x000EE))
.Case("ocirc", helperResolveHTMLHexCharacterReference(0x000F4))
.Case("ucirc", helperResolveHTMLHexCharacterReference(0x000FB))
.Case("ycirc", helperResolveHTMLHexCharacterReference(0x00177))
.Case("Atilde", helperResolveHTMLHexCharacterReference(0x000C3))
.Case("Ntilde", helperResolveHTMLHexCharacterReference(0x000D1))
.Case("Otilde", helperResolveHTMLHexCharacterReference(0x000D5))
.Case("atilde", helperResolveHTMLHexCharacterReference(0x000E3))
.Case("ntilde", helperResolveHTMLHexCharacterReference(0x000F1))
.Case("otilde", helperResolveHTMLHexCharacterReference(0x000F5))
.Case("szlig", helperResolveHTMLHexCharacterReference(0x000DF))
.Case("ccedil", helperResolveHTMLHexCharacterReference(0x000E7))
.Case("Ccedil", helperResolveHTMLHexCharacterReference(0x000C7))
.Case("aring", helperResolveHTMLHexCharacterReference(0x000E5))
.Case("Aring", helperResolveHTMLHexCharacterReference(0x000C5))
.Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0))
.Case("Gamma", helperResolveHTMLHexCharacterReference(0x00393))
.Case("Delta", helperResolveHTMLHexCharacterReference(0x00394))
.Case("Theta", helperResolveHTMLHexCharacterReference(0x00398))
.Case("Lambda", helperResolveHTMLHexCharacterReference(0x0039B))
.Case("Xi", helperResolveHTMLHexCharacterReference(0x0039E))
.Case("Pi", helperResolveHTMLHexCharacterReference(0x003A0))
.Case("Sigma", helperResolveHTMLHexCharacterReference(0x003A3))
.Case("Upsilon", helperResolveHTMLHexCharacterReference(0x003A5))
.Case("Phi", helperResolveHTMLHexCharacterReference(0x003A6))
.Case("Psi", helperResolveHTMLHexCharacterReference(0x003A8))
.Case("Omega", helperResolveHTMLHexCharacterReference(0x003A9))
.Case("alpha", helperResolveHTMLHexCharacterReference(0x003B1))
.Case("beta", helperResolveHTMLHexCharacterReference(0x003B2))
.Case("gamma", helperResolveHTMLHexCharacterReference(0x003B3))
.Case("delta", helperResolveHTMLHexCharacterReference(0x003B4))
.Case("epsilon", helperResolveHTMLHexCharacterReference(0x003B5))
.Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6))
.Case("eta", helperResolveHTMLHexCharacterReference(0x003B7))
.Case("theta", helperResolveHTMLHexCharacterReference(0x003B8))
.Case("iota", helperResolveHTMLHexCharacterReference(0x003B9))
.Case("kappa", helperResolveHTMLHexCharacterReference(0x003BA))
.Case("lambda", helperResolveHTMLHexCharacterReference(0x003BB))
.Case("mu", helperResolveHTMLHexCharacterReference(0x003BC))
.Case("nu", helperResolveHTMLHexCharacterReference(0x003BD))
.Case("xi", helperResolveHTMLHexCharacterReference(0x003BE))
.Case("pi", helperResolveHTMLHexCharacterReference(0x003C0))
.Case("rho", helperResolveHTMLHexCharacterReference(0x003C1))
.Case("sigma", helperResolveHTMLHexCharacterReference(0x003C3))
.Case("tau", helperResolveHTMLHexCharacterReference(0x003C4))
.Case("upsilon", helperResolveHTMLHexCharacterReference(0x003C5))
.Case("phi", helperResolveHTMLHexCharacterReference(0x003C6))
.Case("chi", helperResolveHTMLHexCharacterReference(0x003C7))
.Case("psi", helperResolveHTMLHexCharacterReference(0x003C8))
.Case("omega", helperResolveHTMLHexCharacterReference(0x003C9))
.Case("sigmaf", helperResolveHTMLHexCharacterReference(0x003C2))
.Case("sect", helperResolveHTMLHexCharacterReference(0x000A7))
.Case("deg", helperResolveHTMLHexCharacterReference(0x000B0))
.Case("prime", helperResolveHTMLHexCharacterReference(0x02032))
.Case("Prime", helperResolveHTMLHexCharacterReference(0x02033))
.Case("infin", helperResolveHTMLHexCharacterReference(0x0221E))
.Case("empty", helperResolveHTMLHexCharacterReference(0x02205))
.Case("plusmn", helperResolveHTMLHexCharacterReference(0x000B1))
.Case("times", helperResolveHTMLHexCharacterReference(0x000D7))
.Case("minus", helperResolveHTMLHexCharacterReference(0x02212))
.Case("sdot", helperResolveHTMLHexCharacterReference(0x022C5))
.Case("part", helperResolveHTMLHexCharacterReference(0x02202))
.Case("nabla", helperResolveHTMLHexCharacterReference(0x02207))
.Case("radic", helperResolveHTMLHexCharacterReference(0x0221A))
.Case("perp", helperResolveHTMLHexCharacterReference(0x022A5))
.Case("sum", helperResolveHTMLHexCharacterReference(0x02211))
.Case("int", helperResolveHTMLHexCharacterReference(0x0222B))
.Case("prod", helperResolveHTMLHexCharacterReference(0x0220F))
.Case("sim", helperResolveHTMLHexCharacterReference(0x0223C))
.Case("asymp", helperResolveHTMLHexCharacterReference(0x02248))
.Case("ne", helperResolveHTMLHexCharacterReference(0x02260))
.Case("equiv", helperResolveHTMLHexCharacterReference(0x02261))
.Case("prop", helperResolveHTMLHexCharacterReference(0x0221D))
.Case("le", helperResolveHTMLHexCharacterReference(0x02264))
.Case("ge", helperResolveHTMLHexCharacterReference(0x02265))
.Case("larr", helperResolveHTMLHexCharacterReference(0x02190))
.Case("rarr", helperResolveHTMLHexCharacterReference(0x02192))
.Case("isin", helperResolveHTMLHexCharacterReference(0x02208))
.Case("notin", helperResolveHTMLHexCharacterReference(0x02209))
.Case("lceil", helperResolveHTMLHexCharacterReference(0x02308))
.Case("rceil", helperResolveHTMLHexCharacterReference(0x02309))
.Case("lfloor", helperResolveHTMLHexCharacterReference(0x0230A))
.Case("rfloor", helperResolveHTMLHexCharacterReference(0x0230B))
.Default("");
// Slow path.
.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
}
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
@ -220,13 +64,18 @@ StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
CodePoint *= 10;
CodePoint += Name[i] - '0';
}
return convertCodePointToUTF8(Allocator, CodePoint);
}
char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
char *ResolvedPtr = Resolved;
if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
return StringRef(Resolved, ResolvedPtr - Resolved);
else
return StringRef();
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
unsigned CodePoint = 0;
for (unsigned i = 0, e = Name.size(); i != e; ++i) {
CodePoint *= 16;
const char C = Name[i];
assert(isHTMLHexCharacterReferenceCharacter(C));
CodePoint += llvm::hexDigitValue(C);
}
return convertCodePointToUTF8(Allocator, CodePoint);
}
void Lexer::skipLineStartingDecorations() {
@ -725,17 +574,8 @@ void Lexer::lexHTMLCharacterReference(Token &T) {
StringRef Name(NamePtr, TokenPtr - NamePtr);
TokenPtr++; // Skip semicolon.
StringRef Resolved;
if (isNamed) {
if (isNamed)
Resolved = resolveHTMLNamedCharacterReference(Name);
if (Resolved.empty()) {
Resolved = HTMLDoxygenCharacterReference(Name);
if (!Resolved.empty()) {
formTokenWithChars(T, TokenPtr, tok::text);
T.setText(Resolved);
return;
}
}
}
else if (isDecimal)
Resolved = resolveHTMLDecimalCharacterReference(Name);
else

View File

@ -4,6 +4,7 @@ add_tablegen(clang-tblgen CLANG
ClangASTNodesEmitter.cpp
ClangAttrEmitter.cpp
ClangCommentCommandInfoEmitter.cpp
ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
ClangCommentHTMLTagsEmitter.cpp
ClangDiagnosticsEmitter.cpp
ClangSACheckersEmitter.cpp

View File

@ -0,0 +1,83 @@
//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This tablegen backend emits an efficient function to translate HTML named
// character references to UTF-8 sequences.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/StringMatcher.h"
#include <vector>
using namespace llvm;
/// \brief Convert a code point to the corresponding UTF-8 sequence represented
/// as a C string literal.
///
/// \returns true on success.
static bool translateCodePointToUTF8(unsigned CodePoint,
SmallVectorImpl<char> &CLiteral) {
char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
char *TranslatedPtr = Translated;
if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr))
return false;
StringRef UTF8(Translated, TranslatedPtr - Translated);
raw_svector_ostream OS(CLiteral);
OS << "\"";
for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
OS << "\\x";
OS.write_hex(static_cast<unsigned char>(UTF8[i]));
}
OS << "\"";
return true;
}
namespace clang {
/// \brief Emit the definition of translateHTMLNamedCharacterReferenceToUTF8().
///
/// For every record derived from the TableGen class "NCR", the generated
/// function maps the record's Spelling to a string literal holding the UTF-8
/// encoding of its CodePoint.  Names that match no record fall through to
/// "return StringRef();".
///
/// Records whose code point cannot be converted to UTF-8 are reported with an
/// "invalid code point" diagnostic and left out of the generated matcher.
///
/// \param Records all parsed TableGen records.
/// \param OS stream that receives the generated C++ source.
void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
                                                  raw_ostream &OS) {
  std::vector<Record *> NCRs = Records.getAllDerivedDefinitions("NCR");
  std::vector<StringMatcher::StringPair> NameToUTF8;

  // Scratch buffer for the "return "...";" statement, reused across records.
  SmallString<32> ReturnStmt;
  for (size_t Idx = 0, NumNCRs = NCRs.size(); Idx != NumNCRs; ++Idx) {
    Record &Def = *NCRs[Idx];
    std::string Spelling = Def.getValueAsString("Spelling");
    uint64_t CodePoint = Def.getValueAsInt("CodePoint");

    ReturnStmt.clear();
    ReturnStmt.append("return ");
    if (translateCodePointToUTF8(CodePoint, ReturnStmt)) {
      ReturnStmt.append(";");
      NameToUTF8.push_back(
          StringMatcher::StringPair(Spelling, ReturnStmt.str()));
    } else {
      // Diagnose at the record's location and skip this entry.
      SrcMgr.PrintMessage(Def.getLoc().front(),
                          SourceMgr::DK_Error,
                          Twine("invalid code point"));
    }
  }

  OS << "// This file is generated by TableGen.  Do not edit.\n\n";

  OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n"
        "                                             StringRef Name) {\n";
  // The matcher emits code that returns from the generated function on a hit.
  StringMatcher("Name", NameToUTF8, OS).Emit();
  // Reached only when no spelling matched.
  OS << "  return StringRef();\n"
     << "}\n\n";
}
} // end namespace clang

View File

@ -44,6 +44,7 @@ enum ActionType {
GenClangSACheckers,
GenClangCommentHTMLTags,
GenClangCommentHTMLTagsProperties,
GenClangCommentHTMLNamedCharacterReferences,
GenClangCommentCommandInfo,
GenOptParserDefs, GenOptParserImpl,
GenArmNeon,
@ -111,6 +112,10 @@ namespace {
"gen-clang-comment-html-tags-properties",
"Generate efficient matchers for HTML tag "
"properties"),
clEnumValN(GenClangCommentHTMLNamedCharacterReferences,
"gen-clang-comment-html-named-character-references",
"Generate function to translate named character "
"references to UTF-8 sequences"),
clEnumValN(GenClangCommentCommandInfo,
"gen-clang-comment-command-info",
"Generate list of commands that are used in "
@ -194,6 +199,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenClangCommentHTMLTagsProperties:
EmitClangCommentHTMLTagsProperties(Records, OS);
break;
case GenClangCommentHTMLNamedCharacterReferences:
EmitClangCommentHTMLNamedCharacterReferences(Records, OS);
break;
case GenClangCommentCommandInfo:
EmitClangCommentCommandInfo(Records, OS);
break;

View File

@ -51,6 +51,7 @@ void EmitClangSACheckers(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentHTMLTagsProperties(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, raw_ostream &OS);
void EmitClangCommentCommandInfo(RecordKeeper &Records, raw_ostream &OS);