From 28800da1b352a43175aaa8d224d7fe6895b014be Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 30 Jan 2013 14:29:28 +0000 Subject: [PATCH] Comment parsing: resolve more named character references This reimplements r173850 with a better approach: (1) use a TableGen-generated matcher instead of doing a linear search; (2) avoid allocations for new strings by converting code points to string literals with TableGen. llvm-svn: 173931 --- clang/include/clang/AST/CMakeLists.txt | 4 + .../CommentHTMLNamedCharacterReferences.td | 177 +++++++++++++++ clang/include/clang/AST/CommentLexer.h | 7 - clang/include/clang/AST/Makefile | 10 +- clang/lib/AST/CMakeLists.txt | 1 + clang/lib/AST/CommentLexer.cpp | 204 ++---------------- clang/utils/TableGen/CMakeLists.txt | 1 + ...mentHTMLNamedCharacterReferenceEmitter.cpp | 83 +++++++ clang/utils/TableGen/TableGen.cpp | 8 + clang/utils/TableGen/TableGenBackends.h | 1 + 10 files changed, 306 insertions(+), 190 deletions(-) create mode 100644 clang/include/clang/AST/CommentHTMLNamedCharacterReferences.td create mode 100644 clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp diff --git a/clang/include/clang/AST/CMakeLists.txt b/clang/include/clang/AST/CMakeLists.txt index 547124c95e13..61ba275ce675 100644 --- a/clang/include/clang/AST/CMakeLists.txt +++ b/clang/include/clang/AST/CMakeLists.txt @@ -33,6 +33,10 @@ clang_tablegen(CommentHTMLTagsProperties.inc -gen-clang-comment-html-tags-proper SOURCE CommentHTMLTags.td TARGET ClangCommentHTMLTagsProperties) +clang_tablegen(CommentHTMLNamedCharacterReferences.inc -gen-clang-comment-html-named-character-references + SOURCE CommentHTMLNamedCharacterReferences.td + TARGET ClangCommentHTMLNamedCharacterReferences) + clang_tablegen(CommentCommandInfo.inc -gen-clang-comment-command-info SOURCE CommentCommands.td TARGET ClangCommentCommandInfo) diff --git a/clang/include/clang/AST/CommentHTMLNamedCharacterReferences.td 
b/clang/include/clang/AST/CommentHTMLNamedCharacterReferences.td new file mode 100644 index 000000000000..449310871229 --- /dev/null +++ b/clang/include/clang/AST/CommentHTMLNamedCharacterReferences.td @@ -0,0 +1,177 @@ +// HTML Named Character Reference +class NCR { + string Spelling = spelling; + int CodePoint = codePoint; +} + +// The list below includes named character references supported by Doxygen: +// http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html +// +// It does not include all HTML 5 named character references. +// +// Corresponding code point values can be found here: +// http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html + +def : NCR<"copy", 0x000A9>; +def : NCR<"COPY", 0x000A9>; +def : NCR<"trade", 0x02122>; +def : NCR<"TRADE", 0x02122>; +def : NCR<"reg", 0x000AE>; +def : NCR<"REG", 0x000AE>; +def : NCR<"lt", 0x0003C>; +def : NCR<"Lt", 0x0003C>; +def : NCR<"LT", 0x0003C>; +def : NCR<"gt", 0x0003E>; +def : NCR<"Gt", 0x0003E>; +def : NCR<"GT", 0x0003E>; +def : NCR<"amp", 0x00026>; +def : NCR<"AMP", 0x00026>; +def : NCR<"apos", 0x00027>; +def : NCR<"quot", 0x00022>; +def : NCR<"QUOT", 0x00022>; +def : NCR<"lsquo", 0x02018>; +def : NCR<"rsquo", 0x02019>; +def : NCR<"ldquo", 0x0201C>; +def : NCR<"rdquo", 0x0201D>; +def : NCR<"ndash", 0x02013>; +def : NCR<"mdash", 0x02014>; + +def : NCR<"Auml", 0x000C4>; +def : NCR<"Euml", 0x000CB>; +def : NCR<"Iuml", 0x000CF>; +def : NCR<"Ouml", 0x000D6>; +def : NCR<"Uuml", 0x000DC>; +def : NCR<"Yuml", 0x00178>; +def : NCR<"auml", 0x000E4>; +def : NCR<"euml", 0x000EB>; +def : NCR<"iuml", 0x000EF>; +def : NCR<"ouml", 0x000F6>; +def : NCR<"uuml", 0x000FC>; +def : NCR<"yuml", 0x000FF>; + +def : NCR<"Aacute", 0x000C1>; +def : NCR<"Eacute", 0x000C9>; +def : NCR<"Iacute", 0x000CD>; +def : NCR<"Oacute", 0x000D3>; +def : NCR<"Uacute", 0x000DA>; +def : NCR<"Yacute", 0x000DD>; +def : NCR<"aacute", 0x000E1>; +def : NCR<"eacute", 0x000E9>; +def : NCR<"iacute", 0x000ED>; +def : NCR<"oacute", 0x000F3>; 
+def : NCR<"uacute", 0x000FA>; +def : NCR<"yacute", 0x000FD>; + +def : NCR<"Agrave", 0x000C0>; +def : NCR<"Egrave", 0x000C8>; +def : NCR<"Igrave", 0x000CC>; +def : NCR<"Ograve", 0x000D2>; +def : NCR<"Ugrave", 0x000D9>; +// def : NCR<"Ygrave", 0x01EF2>; // Defined neither in Doxygen, nor in HTML5. +def : NCR<"agrave", 0x000E0>; +def : NCR<"egrave", 0x000E8>; +def : NCR<"igrave", 0x000EC>; +def : NCR<"ograve", 0x000F2>; +def : NCR<"ugrave", 0x000F9>; +def : NCR<"ygrave", 0x01EF3>; // Defined in Doxygen, not defined in HTML5. + +def : NCR<"Acirc", 0x000C2>; +def : NCR<"Ecirc", 0x000CA>; +def : NCR<"Icirc", 0x000CE>; +def : NCR<"Ocirc", 0x000D4>; +def : NCR<"Ucirc", 0x000DB>; +def : NCR<"Ycirc", 0x00176>; // Not defined in Doxygen, defined in HTML5. +def : NCR<"acirc", 0x000E2>; +def : NCR<"ecirc", 0x000EA>; +def : NCR<"icirc", 0x000EE>; +def : NCR<"ocirc", 0x000F4>; +def : NCR<"ucirc", 0x000FB>; +def : NCR<"ycirc", 0x00177>; + +def : NCR<"Atilde", 0x000C3>; +def : NCR<"Ntilde", 0x000D1>; +def : NCR<"Otilde", 0x000D5>; +def : NCR<"atilde", 0x000E3>; +def : NCR<"ntilde", 0x000F1>; +def : NCR<"otilde", 0x000F5>; + +def : NCR<"szlig", 0x000DF>; + +def : NCR<"ccedil", 0x000E7>; +def : NCR<"Ccedil", 0x000C7>; + +def : NCR<"aring", 0x000E5>; +def : NCR<"Aring", 0x000C5>; + +def : NCR<"nbsp", 0x000A0>; + +def : NCR<"Gamma", 0x00393>; +def : NCR<"Delta", 0x00394>; +def : NCR<"Theta", 0x00398>; +def : NCR<"Lambda", 0x0039B>; +def : NCR<"Xi", 0x0039E>; +def : NCR<"Pi", 0x003A0>; +def : NCR<"Sigma", 0x003A3>; +def : NCR<"Upsilon", 0x003A5>; +def : NCR<"Phi", 0x003A6>; +def : NCR<"Psi", 0x003A8>; +def : NCR<"Omega", 0x003A9>; + +def : NCR<"alpha", 0x003B1>; +def : NCR<"beta", 0x003B2>; +def : NCR<"gamma", 0x003B3>; +def : NCR<"delta", 0x003B4>; +def : NCR<"epsilon", 0x003B5>; +def : NCR<"zeta", 0x003B6>; +def : NCR<"eta", 0x003B7>; +def : NCR<"theta", 0x003B8>; +def : NCR<"iota", 0x003B9>; +def : NCR<"kappa", 0x003BA>; +def : NCR<"lambda", 0x003BB>; +def : NCR<"mu", 0x003BC>; +def 
: NCR<"nu", 0x003BD>; +def : NCR<"xi", 0x003BE>; +def : NCR<"pi", 0x003C0>; +def : NCR<"rho", 0x003C1>; +def : NCR<"sigma", 0x003C3>; +def : NCR<"tau", 0x003C4>; +def : NCR<"upsilon", 0x003C5>; +def : NCR<"phi", 0x003C6>; +def : NCR<"chi", 0x003C7>; +def : NCR<"psi", 0x003C8>; +def : NCR<"omega", 0x003C9>; +def : NCR<"sigmaf", 0x003C2>; + +def : NCR<"sect", 0x000A7>; +def : NCR<"deg", 0x000B0>; +def : NCR<"prime", 0x02032>; +def : NCR<"Prime", 0x02033>; +def : NCR<"infin", 0x0221E>; +def : NCR<"empty", 0x02205>; +def : NCR<"plusmn", 0x000B1>; +def : NCR<"times", 0x000D7>; +def : NCR<"minus", 0x02212>; +def : NCR<"sdot", 0x022C5>; +def : NCR<"part", 0x02202>; +def : NCR<"nabla", 0x02207>; +def : NCR<"radic", 0x0221A>; +def : NCR<"perp", 0x022A5>; +def : NCR<"sum", 0x02211>; +def : NCR<"int", 0x0222B>; +def : NCR<"prod", 0x0220F>; +def : NCR<"sim", 0x0223C>; +def : NCR<"asymp", 0x02248>; +def : NCR<"ne", 0x02260>; +def : NCR<"equiv", 0x02261>; +def : NCR<"prop", 0x0221D>; +def : NCR<"le", 0x02264>; +def : NCR<"ge", 0x02265>; +def : NCR<"larr", 0x02190>; +def : NCR<"rarr", 0x02192>; +def : NCR<"isin", 0x02208>; +def : NCR<"notin", 0x02209>; +def : NCR<"lceil", 0x02308>; +def : NCR<"rceil", 0x02309>; +def : NCR<"lfloor", 0x0230A>; +def : NCR<"rfloor", 0x0230B>; + diff --git a/clang/include/clang/AST/CommentLexer.h b/clang/include/clang/AST/CommentLexer.h index 6ce084bc354c..b90414ba0102 100644 --- a/clang/include/clang/AST/CommentLexer.h +++ b/clang/include/clang/AST/CommentLexer.h @@ -282,18 +282,11 @@ private: /// it stands for (e.g., "<"). StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; - /// Given a Doxygen-supported named character reference (e.g., "™"), - /// it returns its UTF8 encoding. - StringRef HTMLDoxygenCharacterReference(StringRef Name) const; - /// Given a Unicode codepoint as base-10 integer, return the character. 
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; /// Given a Unicode codepoint as base-16 integer, return the character. StringRef resolveHTMLHexCharacterReference(StringRef Name) const; - - /// Helper routine to do part of the work for resolveHTMLHexCharacterReference. - StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const; void formTokenWithChars(Token &Result, const char *TokEnd, tok::TokenKind Kind) { diff --git a/clang/include/clang/AST/Makefile b/clang/include/clang/AST/Makefile index 61a0b64ce9f8..143339d1858e 100644 --- a/clang/include/clang/AST/Makefile +++ b/clang/include/clang/AST/Makefile @@ -3,7 +3,9 @@ TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic BUILT_SOURCES = Attrs.inc AttrImpl.inc AttrDump.inc \ StmtNodes.inc DeclNodes.inc \ CommentNodes.inc CommentHTMLTags.inc \ - CommentHTMLTagsProperties.inc CommentCommandInfo.inc + CommentHTMLTagsProperties.inc \ + CommentHTMLNamedCharacterReferences.inc \ + CommentCommandInfo.inc TABLEGEN_INC_FILES_COMMON = 1 @@ -52,6 +54,12 @@ $(ObjDir)/CommentHTMLTagsProperties.inc.tmp : $(PROJ_SRC_DIR)/CommentHTMLTags.td $(Echo) "Building Clang comment HTML tag properties with tblgen" $(Verb) $(ClangTableGen) -gen-clang-comment-html-tags-properties -o $(call SYSPATH, $@) $< +$(ObjDir)/CommentHTMLNamedCharacterReferences.inc.tmp : \ + $(PROJ_SRC_DIR)/CommentHTMLNamedCharacterReferences.td \ + $(CLANG_TBLGEN) $(ObjDir)/.dir + $(Echo) "Building Clang named character reference translation function with tblgen" + $(Verb) $(ClangTableGen) -gen-clang-comment-html-named-character-references -o $(call SYSPATH, $@) $< + $(ObjDir)/CommentCommandInfo.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \ $(CLANG_TBLGEN) $(ObjDir)/.dir $(Echo) "Building Clang comment command info with tblgen" diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 923c519d4918..328f38515314 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -68,6 +68,7 @@ 
add_dependencies(clangAST ClangCommentNodes ClangCommentHTMLTags ClangCommentHTMLTagsProperties + ClangCommentHTMLNamedCharacterReferences ClangDeclNodes ClangDiagnosticAST ClangDiagnosticComment diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp index b92b1fb33dc5..e4441c13f75f 100644 --- a/clang/lib/AST/CommentLexer.cpp +++ b/clang/lib/AST/CommentLexer.cpp @@ -30,22 +30,8 @@ bool isHTMLHexCharacterReferenceCharacter(char C) { (C >= 'A' && C <= 'F'); } -#include "clang/AST/CommentHTMLTags.inc" - -} // unnamed namespace - -static unsigned getCodePoint(StringRef Name) { - unsigned CodePoint = 0; - for (unsigned i = 0, e = Name.size(); i != e; ++i) { - CodePoint *= 16; - const char C = Name[i]; - assert(isHTMLHexCharacterReferenceCharacter(C)); - CodePoint += llvm::hexDigitValue(C); - } - return CodePoint; -} - -StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const { +StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, + unsigned CodePoint) { char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); char *ResolvedPtr = Resolved; if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) @@ -53,164 +39,22 @@ StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) cons else return StringRef(); } - -StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { - unsigned CodePoint = getCodePoint(Name); - return helperResolveHTMLHexCharacterReference(CodePoint); -} + +#include "clang/AST/CommentHTMLTags.inc" +#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" + +} // unnamed namespace StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { + // Fast path, first check a few most widely used named character references. 
return llvm::StringSwitch(Name) .Case("amp", "&") .Case("lt", "<") .Case("gt", ">") .Case("quot", "\"") .Case("apos", "\'") - .Default(""); -} - -StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const { - return llvm::StringSwitch(Name) - .Case("copy", helperResolveHTMLHexCharacterReference(0x000A9)) - .Case("trade", helperResolveHTMLHexCharacterReference(0x02122)) - .Case("reg", helperResolveHTMLHexCharacterReference(0x000AE)) - .Case("lt", helperResolveHTMLHexCharacterReference(0x0003C)) - .Case("gt", helperResolveHTMLHexCharacterReference(0x0003C)) - .Case("amp", helperResolveHTMLHexCharacterReference(0x00026)) - .Case("apos", helperResolveHTMLHexCharacterReference(0x00027)) - .Case("quot", helperResolveHTMLHexCharacterReference(0x00022)) - .Case("lsquo", helperResolveHTMLHexCharacterReference(0x02018)) - .Case("rsquo", helperResolveHTMLHexCharacterReference(0x02019)) - .Case("ldquo", helperResolveHTMLHexCharacterReference(0x0201C)) - .Case("rdquo", helperResolveHTMLHexCharacterReference(0x0201D)) - .Case("ndash", helperResolveHTMLHexCharacterReference(0x02013)) - .Case("mdash", helperResolveHTMLHexCharacterReference(0x02014)) - .Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4)) - .Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB)) - .Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF)) - .Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6)) - .Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC)) - .Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178)) - .Case("auml", helperResolveHTMLHexCharacterReference(0x000E4)) - .Case("euml", helperResolveHTMLHexCharacterReference(0x000EB)) - .Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF)) - .Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6)) - .Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC)) - .Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF)) - .Case("Aacute", 
helperResolveHTMLHexCharacterReference(0x000C1)) - .Case("Eacute", helperResolveHTMLHexCharacterReference(0x000C9)) - .Case("Iacute", helperResolveHTMLHexCharacterReference(0x000CD)) - .Case("Oacute", helperResolveHTMLHexCharacterReference(0x000D3)) - .Case("Uacute", helperResolveHTMLHexCharacterReference(0x000DA)) - .Case("Yacute", helperResolveHTMLHexCharacterReference(0x000DD)) - .Case("aacute", helperResolveHTMLHexCharacterReference(0x000E1)) - .Case("eacute", helperResolveHTMLHexCharacterReference(0x000E9)) - .Case("iacute", helperResolveHTMLHexCharacterReference(0x000ED)) - .Case("oacute", helperResolveHTMLHexCharacterReference(0x000F3)) - .Case("uacute", helperResolveHTMLHexCharacterReference(0x000FA)) - .Case("yacute", helperResolveHTMLHexCharacterReference(0x000FD)) - .Case("Agrave", helperResolveHTMLHexCharacterReference(0x000C0)) - .Case("Egrave", helperResolveHTMLHexCharacterReference(0x000C8)) - .Case("Igrave", helperResolveHTMLHexCharacterReference(0x000CC)) - .Case("Ograve", helperResolveHTMLHexCharacterReference(0x000D2)) - .Case("Ugrave", helperResolveHTMLHexCharacterReference(0x000D9)) - .Case("agrave", helperResolveHTMLHexCharacterReference(0x000E0)) - .Case("egrave", helperResolveHTMLHexCharacterReference(0x000E8)) - .Case("igrave", helperResolveHTMLHexCharacterReference(0x000EC)) - .Case("ograve", helperResolveHTMLHexCharacterReference(0x000F2)) - .Case("ugrave", helperResolveHTMLHexCharacterReference(0x000F9)) - .Case("ygrave", helperResolveHTMLHexCharacterReference(0x01EF3)) - .Case("Acirc", helperResolveHTMLHexCharacterReference(0x000C2)) - .Case("Ecirc", helperResolveHTMLHexCharacterReference(0x000CA)) - .Case("Icirc", helperResolveHTMLHexCharacterReference(0x000CE)) - .Case("Ocirc", helperResolveHTMLHexCharacterReference(0x000D4)) - .Case("Ucirc", helperResolveHTMLHexCharacterReference(0x000DB)) - .Case("acirc", helperResolveHTMLHexCharacterReference(0x000E2)) - .Case("ecirc", helperResolveHTMLHexCharacterReference(0x000EA)) - 
.Case("icirc", helperResolveHTMLHexCharacterReference(0x000EE)) - .Case("ocirc", helperResolveHTMLHexCharacterReference(0x000F4)) - .Case("ucirc", helperResolveHTMLHexCharacterReference(0x000FB)) - .Case("ycirc", helperResolveHTMLHexCharacterReference(0x00177)) - .Case("Atilde", helperResolveHTMLHexCharacterReference(0x000C3)) - .Case("Ntilde", helperResolveHTMLHexCharacterReference(0x000D1)) - .Case("Otilde", helperResolveHTMLHexCharacterReference(0x000D5)) - .Case("atilde", helperResolveHTMLHexCharacterReference(0x000E3)) - .Case("ntilde", helperResolveHTMLHexCharacterReference(0x000F1)) - .Case("otilde", helperResolveHTMLHexCharacterReference(0x000F5)) - .Case("szlig", helperResolveHTMLHexCharacterReference(0x000DF)) - .Case("ccedil", helperResolveHTMLHexCharacterReference(0x000E7)) - .Case("Ccedil", helperResolveHTMLHexCharacterReference(0x000C7)) - .Case("aring", helperResolveHTMLHexCharacterReference(0x000E5)) - .Case("Aring", helperResolveHTMLHexCharacterReference(0x000C5)) - .Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0)) - .Case("Gamma", helperResolveHTMLHexCharacterReference(0x00393)) - .Case("Delta", helperResolveHTMLHexCharacterReference(0x00394)) - .Case("Theta", helperResolveHTMLHexCharacterReference(0x00398)) - .Case("Lambda", helperResolveHTMLHexCharacterReference(0x0039B)) - .Case("Xi", helperResolveHTMLHexCharacterReference(0x0039E)) - .Case("Pi", helperResolveHTMLHexCharacterReference(0x003A0)) - .Case("Sigma", helperResolveHTMLHexCharacterReference(0x003A3)) - .Case("Upsilon", helperResolveHTMLHexCharacterReference(0x003A5)) - .Case("Phi", helperResolveHTMLHexCharacterReference(0x003A6)) - .Case("Psi", helperResolveHTMLHexCharacterReference(0x003A8)) - .Case("Omega", helperResolveHTMLHexCharacterReference(0x003A9)) - .Case("alpha", helperResolveHTMLHexCharacterReference(0x003B1)) - .Case("beta", helperResolveHTMLHexCharacterReference(0x003B2)) - .Case("gamma", helperResolveHTMLHexCharacterReference(0x003B3)) - .Case("delta", 
helperResolveHTMLHexCharacterReference(0x003B4)) - .Case("epsilon", helperResolveHTMLHexCharacterReference(0x003B5)) - .Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6)) - .Case("eta", helperResolveHTMLHexCharacterReference(0x003B7)) - .Case("theta", helperResolveHTMLHexCharacterReference(0x003B8)) - .Case("iota", helperResolveHTMLHexCharacterReference(0x003B9)) - .Case("kappa", helperResolveHTMLHexCharacterReference(0x003BA)) - .Case("lambda", helperResolveHTMLHexCharacterReference(0x003BB)) - .Case("mu", helperResolveHTMLHexCharacterReference(0x003BC)) - .Case("nu", helperResolveHTMLHexCharacterReference(0x003BD)) - .Case("xi", helperResolveHTMLHexCharacterReference(0x003BE)) - .Case("pi", helperResolveHTMLHexCharacterReference(0x003C0)) - .Case("rho", helperResolveHTMLHexCharacterReference(0x003C1)) - .Case("sigma", helperResolveHTMLHexCharacterReference(0x003C3)) - .Case("tau", helperResolveHTMLHexCharacterReference(0x003C4)) - .Case("upsilon", helperResolveHTMLHexCharacterReference(0x003C5)) - .Case("phi", helperResolveHTMLHexCharacterReference(0x003C6)) - .Case("chi", helperResolveHTMLHexCharacterReference(0x003C7)) - .Case("psi", helperResolveHTMLHexCharacterReference(0x003C8)) - .Case("omega", helperResolveHTMLHexCharacterReference(0x003C9)) - .Case("sigmaf", helperResolveHTMLHexCharacterReference(0x003C2)) - .Case("sect", helperResolveHTMLHexCharacterReference(0x000A7)) - .Case("deg", helperResolveHTMLHexCharacterReference(0x000B0)) - .Case("prime", helperResolveHTMLHexCharacterReference(0x02032)) - .Case("Prime", helperResolveHTMLHexCharacterReference(0x02033)) - .Case("infin", helperResolveHTMLHexCharacterReference(0x0221E)) - .Case("empty", helperResolveHTMLHexCharacterReference(0x02205)) - .Case("plusmn", helperResolveHTMLHexCharacterReference(0x000B1)) - .Case("times", helperResolveHTMLHexCharacterReference(0x000D7)) - .Case("minus", helperResolveHTMLHexCharacterReference(0x02212)) - .Case("sdot", 
helperResolveHTMLHexCharacterReference(0x022C5)) - .Case("part", helperResolveHTMLHexCharacterReference(0x02202)) - .Case("nabla", helperResolveHTMLHexCharacterReference(0x02207)) - .Case("radic", helperResolveHTMLHexCharacterReference(0x0221A)) - .Case("perp", helperResolveHTMLHexCharacterReference(0x022A5)) - .Case("sum", helperResolveHTMLHexCharacterReference(0x02211)) - .Case("int", helperResolveHTMLHexCharacterReference(0x0222B)) - .Case("prod", helperResolveHTMLHexCharacterReference(0x0220F)) - .Case("sim", helperResolveHTMLHexCharacterReference(0x0223C)) - .Case("asymp", helperResolveHTMLHexCharacterReference(0x02248)) - .Case("ne", helperResolveHTMLHexCharacterReference(0x02260)) - .Case("equiv", helperResolveHTMLHexCharacterReference(0x02261)) - .Case("prop", helperResolveHTMLHexCharacterReference(0x0221D)) - .Case("le", helperResolveHTMLHexCharacterReference(0x02264)) - .Case("ge", helperResolveHTMLHexCharacterReference(0x02265)) - .Case("larr", helperResolveHTMLHexCharacterReference(0x02190)) - .Case("rarr", helperResolveHTMLHexCharacterReference(0x02192)) - .Case("isin", helperResolveHTMLHexCharacterReference(0x02208)) - .Case("notin", helperResolveHTMLHexCharacterReference(0x02209)) - .Case("lceil", helperResolveHTMLHexCharacterReference(0x02308)) - .Case("rceil", helperResolveHTMLHexCharacterReference(0x02309)) - .Case("lfloor", helperResolveHTMLHexCharacterReference(0x0230A)) - .Case("rfloor", helperResolveHTMLHexCharacterReference(0x0230B)) - .Default(""); + // Slow path. 
+ .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); } StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { @@ -220,13 +64,18 @@ StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { CodePoint *= 10; CodePoint += Name[i] - '0'; } + return convertCodePointToUTF8(Allocator, CodePoint); +} - char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); - char *ResolvedPtr = Resolved; - if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) - return StringRef(Resolved, ResolvedPtr - Resolved); - else - return StringRef(); +StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + CodePoint *= 16; + const char C = Name[i]; + assert(isHTMLHexCharacterReferenceCharacter(C)); + CodePoint += llvm::hexDigitValue(C); + } + return convertCodePointToUTF8(Allocator, CodePoint); } void Lexer::skipLineStartingDecorations() { @@ -725,17 +574,8 @@ void Lexer::lexHTMLCharacterReference(Token &T) { StringRef Name(NamePtr, TokenPtr - NamePtr); TokenPtr++; // Skip semicolon. 
 StringRef Resolved; - if (isNamed) { + if (isNamed) Resolved = resolveHTMLNamedCharacterReference(Name); - if (Resolved.empty()) { - Resolved = HTMLDoxygenCharacterReference(Name); - if (!Resolved.empty()) { - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Resolved); - return; - } - } - } else if (isDecimal) Resolved = resolveHTMLDecimalCharacterReference(Name); else diff --git a/clang/utils/TableGen/CMakeLists.txt b/clang/utils/TableGen/CMakeLists.txt index 534ac9af7760..a858a214b03d 100644 --- a/clang/utils/TableGen/CMakeLists.txt +++ b/clang/utils/TableGen/CMakeLists.txt @@ -4,6 +4,7 @@ add_tablegen(clang-tblgen CLANG ClangASTNodesEmitter.cpp ClangAttrEmitter.cpp ClangCommentCommandInfoEmitter.cpp + ClangCommentHTMLNamedCharacterReferenceEmitter.cpp ClangCommentHTMLTagsEmitter.cpp ClangDiagnosticsEmitter.cpp ClangSACheckersEmitter.cpp diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp new file mode 100644 index 000000000000..3afe2b73f0aa --- /dev/null +++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp @@ -0,0 +1,83 @@ +//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This tablegen backend emits an efficient function to translate HTML named +// character references to UTF-8 sequences. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/StringMatcher.h" +#include + +using namespace llvm; + +/// \brief Convert a code point to the corresponding UTF-8 sequence represented +/// as a C string literal. +/// +/// \returns true on success. +static bool translateCodePointToUTF8(unsigned CodePoint, + SmallVectorImpl &CLiteral) { + char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; + char *TranslatedPtr = Translated; + if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr)) + return false; + + StringRef UTF8(Translated, TranslatedPtr - Translated); + + raw_svector_ostream OS(CLiteral); + OS << "\""; + for (size_t i = 0, e = UTF8.size(); i != e; ++i) { + OS << "\\x"; + OS.write_hex(static_cast(UTF8[i])); + } + OS << "\""; + + return true; +} + +namespace clang { +void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, + raw_ostream &OS) { + std::vector Tags = Records.getAllDerivedDefinitions("NCR"); + std::vector NameToUTF8; + SmallString<32> CLiteral; + for (std::vector::iterator I = Tags.begin(), E = Tags.end(); + I != E; ++I) { + Record &Tag = **I; + std::string Spelling = Tag.getValueAsString("Spelling"); + uint64_t CodePoint = Tag.getValueAsInt("CodePoint"); + CLiteral.clear(); + CLiteral.append("return "); + if (!translateCodePointToUTF8(CodePoint, CLiteral)) { + SrcMgr.PrintMessage(Tag.getLoc().front(), + SourceMgr::DK_Error, + Twine("invalid code point")); + continue; + } + CLiteral.append(";"); + + StringMatcher::StringPair Match(Spelling, CLiteral.str()); + NameToUTF8.push_back(Match); + } + + OS << "// This file is generated by TableGen. 
Do not edit.\n\n"; + + OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n" + " StringRef Name) {\n"; + StringMatcher("Name", NameToUTF8, OS).Emit(); + OS << " return StringRef();\n" + << "}\n\n"; +} + +} // end namespace clang + diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 8af6598cd070..4097339b9a0b 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -44,6 +44,7 @@ enum ActionType { GenClangSACheckers, GenClangCommentHTMLTags, GenClangCommentHTMLTagsProperties, + GenClangCommentHTMLNamedCharacterReferences, GenClangCommentCommandInfo, GenOptParserDefs, GenOptParserImpl, GenArmNeon, @@ -111,6 +112,10 @@ namespace { "gen-clang-comment-html-tags-properties", "Generate efficient matchers for HTML tag " "properties"), + clEnumValN(GenClangCommentHTMLNamedCharacterReferences, + "gen-clang-comment-html-named-character-references", + "Generate function to translate named character " + "references to UTF-8 sequences"), clEnumValN(GenClangCommentCommandInfo, "gen-clang-comment-command-info", "Generate list of commands that are used in " @@ -194,6 +199,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenClangCommentHTMLTagsProperties: EmitClangCommentHTMLTagsProperties(Records, OS); break; + case GenClangCommentHTMLNamedCharacterReferences: + EmitClangCommentHTMLNamedCharacterReferences(Records, OS); + break; case GenClangCommentCommandInfo: EmitClangCommentCommandInfo(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 637e54c01b26..3bc4c906c005 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -51,6 +51,7 @@ void EmitClangSACheckers(RecordKeeper &Records, raw_ostream &OS); void EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS); void EmitClangCommentHTMLTagsProperties(RecordKeeper &Records, raw_ostream &OS); +void 
EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, raw_ostream &OS); void EmitClangCommentCommandInfo(RecordKeeper &Records, raw_ostream &OS);