forked from OSchip/llvm-project
Enhance sourcemgr to detect various UTF BOMs and emit a fatal error
about them instead of producing tons of garbage from the lexer. It would be even better for sourcemgr to dynamically transcode (e.g. from UTF-16 to UTF-8). llvm-svn: 101924
This commit is contained in:
parent
91baecfeb3
commit
8fbe98b3b6
|
@ -72,5 +72,6 @@ def err_target_invalid_feature : Error<"invalid target feature '%0'">;
|
|||
// Fatal-by-default error for a file that could not be opened. '%0' is printed
// in quotes as the file; %1 follows the colon (presumably the reason text --
// confirm at the call site).
def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal;
|
||||
// Fatal-by-default error reporting that file '%0' was modified since it was
// first processed.
def err_file_modified : Error<
|
||||
"file '%0' modified since it was first processed">, DefaultFatal;
|
||||
|
||||
// Fatal-by-default error reported when a byte order mark for an unsupported
// encoding is detected. %0 is the name of the detected encoding and '%1' is
// the name of the file in which the mark was found.
def err_unsupported_bom : Error<"%0 byte order mark detected in '%1', but "
|
||||
"encoding is not supported">, DefaultFatal;
|
||||
}
|
||||
|
|
|
@ -119,6 +119,41 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag,
|
|||
Buffer.setInt(true);
|
||||
#endif
|
||||
}
|
||||
|
||||
// If the buffer is valid, check to see if it has a UTF Byte Order Mark
|
||||
// (BOM). We only support UTF-8 without a BOM right now. See
|
||||
// http://en.wikipedia.org/wiki/Byte_order_mark for more information.
|
||||
if (!Buffer.getInt()) {
|
||||
llvm::StringRef BufStr = Buffer.getPointer()->getBuffer();
|
||||
const char *BOM = 0;
|
||||
if (BufStr.startswith("\xFE\xBB\xBF"))
|
||||
BOM = "UTF-8";
|
||||
else if (BufStr.startswith("\xFE\xFF"))
|
||||
BOM = "UTF-16 (BE)";
|
||||
else if (BufStr.startswith("\xFF\xFE"))
|
||||
BOM = "UTF-16 (LE)";
|
||||
else if (BufStr.startswith(llvm::StringRef("\x00\x00\xFE\xFF", 4)))
|
||||
BOM = "UTF-32 (BE)";
|
||||
else if (BufStr.startswith(llvm::StringRef("\xFF\xFE\x00\x00", 4)))
|
||||
BOM = "UTF-32 (LE)";
|
||||
else if (BufStr.startswith("\x2B\x2F\x76"))
|
||||
BOM = "UTF-7";
|
||||
else if (BufStr.startswith("\xF7\x64\x4C"))
|
||||
BOM = "UTF-1";
|
||||
else if (BufStr.startswith("\xDD\x73\x66\x73"))
|
||||
BOM = "UTF-EBCDIC";
|
||||
else if (BufStr.startswith("\x0E\xFE\xFF"))
|
||||
BOM = "SDSU";
|
||||
else if (BufStr.startswith("\xFB\xEE\x28"))
|
||||
BOM = "BOCU-1";
|
||||
else if (BufStr.startswith("\x84\x31\x95\x33"))
|
||||
BOM = "BOCU-1";
|
||||
|
||||
if (BOM) {
|
||||
Diag.Report(diag::err_unsupported_bom) << BOM << Entry->getName();
|
||||
Buffer.setInt(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Invalid)
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
// RUN: not %clang -xc %s.txt -fsyntax-only 2>&1 | grep 'UTF-16 (LE) byte order mark detected'
|
||||
// rdar://7876588
|
||||
|
||||
// This test verifies that clang gives a decent error for UTF-16 source files.
|
Binary file not shown.
Loading…
Reference in New Issue