2007-04-05 14:57:15 +08:00
|
|
|
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
|
2007-03-10 07:16:33 +08:00
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 03:59:25 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2007-03-10 07:16:33 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2007-04-05 14:57:15 +08:00
|
|
|
// This file implements the NumericLiteralParser, CharLiteralParser, and
|
|
|
|
// StringLiteralParser interfaces.
|
2007-03-10 07:16:33 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "clang/Lex/LiteralSupport.h"
|
2013-02-09 06:30:41 +08:00
|
|
|
#include "clang/Basic/CharInfo.h"
|
2012-12-04 17:13:33 +08:00
|
|
|
#include "clang/Basic/TargetInfo.h"
|
|
|
|
#include "clang/Lex/LexDiagnostic.h"
|
|
|
|
#include "clang/Lex/Preprocessor.h"
|
2007-03-14 06:37:02 +08:00
|
|
|
#include "llvm/ADT/StringExtras.h"
|
2013-01-30 20:06:08 +08:00
|
|
|
#include "llvm/Support/ConvertUTF.h"
|
2011-09-23 13:35:21 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2013-01-30 20:06:08 +08:00
|
|
|
|
2007-03-10 07:16:33 +08:00
|
|
|
using namespace clang;
|
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
/// Map a character- or string-literal token kind to the width that \p Target
/// reports for the character type backing that literal.
///
/// Any token kind other than the literal kinds listed below is a programming
/// error and hits llvm_unreachable.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  switch (kind) {
  // Ordinary and UTF-8 literals share the plain char width.
  case tok::char_constant:
  case tok::utf8_char_constant:
  case tok::string_literal:
  case tok::utf8_string_literal:
    return Target.getCharWidth();

  // Wide literals.
  case tok::wide_char_constant:
  case tok::wide_string_literal:
    return Target.getWCharWidth();

  // UTF-16 literals.
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
    return Target.getChar16Width();

  // UTF-32 literals.
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return Target.getChar32Width();

  default:
    break;
  }

  llvm_unreachable("Unknown token type!");
}
|
|
|
|
|
2012-10-29 02:24:46 +08:00
|
|
|
/// Build the character range that corresponds to the spelling-buffer span
/// [\p TokRangeBegin, \p TokRangeEnd) of the token located at \p TokLoc.
///
/// \p TokBegin is the start of the token's spelling buffer; the two range
/// pointers are turned into offsets relative to it and mapped to source
/// locations with Lexer::AdvanceToTokenCharacter.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SM = TokLoc.getManager();

  // Start of the range: offset from the beginning of the token.
  SourceLocation RangeStart = Lexer::AdvanceToTokenCharacter(
      TokLoc, TokRangeBegin - TokBegin, SM, Features);

  // End of the range: advance further from the start by the range length.
  SourceLocation RangeStop = Lexer::AdvanceToTokenCharacter(
      RangeStart, TokRangeEnd - TokRangeBegin, SM, Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
|
|
|
|
|
2012-09-08 15:16:20 +08:00
|
|
|
/// \brief Produce a diagnostic highlighting some portion of a literal.
|
|
|
|
///
|
|
|
|
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
|
|
|
|
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
|
|
|
|
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
|
|
|
|
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
|
|
|
|
const LangOptions &Features, FullSourceLoc TokLoc,
|
|
|
|
const char *TokBegin, const char *TokRangeBegin,
|
|
|
|
const char *TokRangeEnd, unsigned DiagID) {
|
|
|
|
SourceLocation Begin =
|
|
|
|
Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
|
|
|
|
TokLoc.getManager(), Features);
|
2012-10-29 02:24:46 +08:00
|
|
|
return Diags->Report(Begin, DiagID) <<
|
|
|
|
MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
|
2012-09-08 15:16:20 +08:00
|
|
|
}
|
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
|
|
|
|
/// either a character or a string literal.
|
2012-09-08 15:16:20 +08:00
|
|
|
static unsigned ProcessCharEscape(const char *ThisTokBegin,
|
|
|
|
const char *&ThisTokBuf,
|
2007-04-05 14:57:15 +08:00
|
|
|
const char *ThisTokEnd, bool &HadError,
|
2011-07-27 13:40:30 +08:00
|
|
|
FullSourceLoc Loc, unsigned CharWidth,
|
2012-09-08 15:16:20 +08:00
|
|
|
DiagnosticsEngine *Diags,
|
|
|
|
const LangOptions &Features) {
|
|
|
|
const char *EscapeBegin = ThisTokBuf;
|
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
// Skip the '\' char.
|
|
|
|
++ThisTokBuf;
|
|
|
|
|
|
|
|
// We know that this character can't be off the end of the buffer, because
|
|
|
|
// that would have been \", which would not have been the end of string.
|
|
|
|
unsigned ResultChar = *ThisTokBuf++;
|
|
|
|
switch (ResultChar) {
|
|
|
|
// These map to themselves.
|
|
|
|
case '\\': case '\'': case '"': case '?': break;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
// These have fixed mappings.
|
|
|
|
case 'a':
|
|
|
|
// TODO: K&R: the meaning of '\\a' is different in traditional C
|
|
|
|
ResultChar = 7;
|
|
|
|
break;
|
|
|
|
case 'b':
|
|
|
|
ResultChar = 8;
|
|
|
|
break;
|
|
|
|
case 'e':
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
|
|
|
diag::ext_nonstandard_escape) << "e";
|
2007-04-05 14:57:15 +08:00
|
|
|
ResultChar = 27;
|
|
|
|
break;
|
2009-06-10 09:32:39 +08:00
|
|
|
case 'E':
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
|
|
|
diag::ext_nonstandard_escape) << "E";
|
2009-06-10 09:32:39 +08:00
|
|
|
ResultChar = 27;
|
|
|
|
break;
|
2007-04-05 14:57:15 +08:00
|
|
|
case 'f':
|
|
|
|
ResultChar = 12;
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
ResultChar = 10;
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
ResultChar = 13;
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
ResultChar = 9;
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
ResultChar = 11;
|
|
|
|
break;
|
2007-05-20 13:00:58 +08:00
|
|
|
case 'x': { // Hex escape.
|
|
|
|
ResultChar = 0;
|
2013-02-09 06:30:41 +08:00
|
|
|
if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
2013-01-25 04:50:13 +08:00
|
|
|
diag::err_hex_escape_no_digits) << "x";
|
2007-04-05 14:57:15 +08:00
|
|
|
HadError = 1;
|
|
|
|
break;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-05-20 13:17:04 +08:00
|
|
|
// Hex escapes are a maximal series of hex digits.
|
2007-05-20 13:00:58 +08:00
|
|
|
bool Overflow = false;
|
|
|
|
for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
|
2013-01-19 06:33:58 +08:00
|
|
|
int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
|
2007-05-20 13:00:58 +08:00
|
|
|
if (CharVal == -1) break;
|
2008-10-01 04:45:40 +08:00
|
|
|
// About to shift out a digit?
|
2015-03-24 03:54:44 +08:00
|
|
|
if (ResultChar & 0xF0000000)
|
|
|
|
Overflow = true;
|
2007-05-20 13:00:58 +08:00
|
|
|
ResultChar <<= 4;
|
|
|
|
ResultChar |= CharVal;
|
|
|
|
}
|
|
|
|
|
|
|
|
// See if any bits will be truncated when evaluated as a character.
|
|
|
|
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
|
|
|
|
Overflow = true;
|
|
|
|
ResultChar &= ~0U >> (32-CharWidth);
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-05-20 13:00:58 +08:00
|
|
|
// Check for overflow.
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Overflow && Diags) // Too many digits to fit in
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
2015-11-14 10:09:55 +08:00
|
|
|
diag::err_escape_too_large) << 0;
|
2007-04-05 14:57:15 +08:00
|
|
|
break;
|
2007-05-20 13:00:58 +08:00
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
case '0': case '1': case '2': case '3':
|
2007-05-20 13:17:04 +08:00
|
|
|
case '4': case '5': case '6': case '7': {
|
2007-04-05 14:57:15 +08:00
|
|
|
// Octal escapes.
|
2007-06-09 14:20:47 +08:00
|
|
|
--ThisTokBuf;
|
2007-05-20 13:17:04 +08:00
|
|
|
ResultChar = 0;
|
|
|
|
|
|
|
|
// Octal escapes are a series of octal digits with maximum length 3.
|
|
|
|
// "\0123" is a two digit sequence equal to "\012" "3".
|
|
|
|
unsigned NumDigits = 0;
|
|
|
|
do {
|
|
|
|
ResultChar <<= 3;
|
|
|
|
ResultChar |= *ThisTokBuf++ - '0';
|
|
|
|
++NumDigits;
|
|
|
|
} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
|
|
|
|
ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-05-20 13:17:04 +08:00
|
|
|
// Check for overflow. Reject '\777', but not L'\777'.
|
|
|
|
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
2015-11-14 10:09:55 +08:00
|
|
|
diag::err_escape_too_large) << 1;
|
2007-05-20 13:17:04 +08:00
|
|
|
ResultChar &= ~0U >> (32-CharWidth);
|
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
break;
|
2007-05-20 13:17:04 +08:00
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
// Otherwise, these are not valid escapes.
|
|
|
|
case '(': case '{': case '[': case '%':
|
|
|
|
// GCC accepts these as extensions. We warn about them as such though.
|
2010-11-17 14:26:08 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
|
|
|
diag::ext_nonstandard_escape)
|
|
|
|
<< std::string(1, ResultChar);
|
2009-04-28 08:51:18 +08:00
|
|
|
break;
|
2007-04-05 14:57:15 +08:00
|
|
|
default:
|
2014-05-18 07:10:59 +08:00
|
|
|
if (!Diags)
|
2010-05-26 13:35:51 +08:00
|
|
|
break;
|
2012-09-08 15:16:20 +08:00
|
|
|
|
2013-02-09 06:30:41 +08:00
|
|
|
if (isPrintable(ResultChar))
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
|
|
|
diag::ext_unknown_escape)
|
|
|
|
<< std::string(1, ResultChar);
|
2008-11-22 15:23:31 +08:00
|
|
|
else
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
|
|
|
diag::ext_unknown_escape)
|
|
|
|
<< "x" + llvm::utohexstr(ResultChar);
|
2007-04-05 14:57:15 +08:00
|
|
|
break;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
return ResultChar;
|
|
|
|
}
|
|
|
|
|
2014-02-18 05:52:30 +08:00
|
|
|
static void appendCodePoint(unsigned Codepoint,
|
|
|
|
llvm::SmallVectorImpl<char> &Str) {
|
|
|
|
char ResultBuf[4];
|
|
|
|
char *ResultPtr = ResultBuf;
|
|
|
|
bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
|
|
|
|
(void)Res;
|
|
|
|
assert(Res && "Unexpected conversion failure");
|
|
|
|
Str.append(ResultBuf, ResultPtr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Copy \p Input to the end of \p Buf, replacing every universal character
/// name (\\uXXXX or \\UXXXXXXXX) by the bytes appendCodePoint emits for its
/// code point. All other characters are copied unchanged.
///
/// The input is trusted to be well formed: every backslash must introduce a
/// complete UCN with exactly 4 ('u') or 8 ('U') hex digits. Violations are
/// caught by assertions only.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
    if (*I != '\\') {
      Buf.push_back(*I);
      continue;
    }

    // Step onto the 'u' / 'U' that selects the escape length.
    ++I;
    assert(I != E && "backslash at end of input");
    assert(*I == 'u' || *I == 'U');

    unsigned NumHexDigits = (*I == 'u') ? 4 : 8;

    // I still points at the 'u'/'U', so the digits live at I[1] through
    // I[NumHexDigits]. The last of them must lie strictly before E, i.e. more
    // than NumHexDigits characters must remain starting at I.
    assert(static_cast<unsigned>(E - I) > NumHexDigits && "truncated UCN");

    uint32_t CodePoint = 0;
    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);

      CodePoint <<= 4;
      CodePoint += Value;
    }

    appendCodePoint(CodePoint, Buf);

    // The digit loop left I one past the last digit; back up so the outer
    // loop's ++I lands on the character following the escape.
    --I;
  }
}
|
|
|
|
|
2009-03-31 07:46:03 +08:00
|
|
|
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
|
2010-10-09 08:27:47 +08:00
|
|
|
/// return the UTF32.
|
2012-03-10 06:27:51 +08:00
|
|
|
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
|
|
|
const char *ThisTokEnd,
|
2010-10-09 08:27:47 +08:00
|
|
|
uint32_t &UcnVal, unsigned short &UcnLen,
|
2011-09-26 07:23:43 +08:00
|
|
|
FullSourceLoc Loc, DiagnosticsEngine *Diags,
|
2012-01-18 20:27:04 +08:00
|
|
|
const LangOptions &Features,
|
|
|
|
bool in_char_string_literal = false) {
|
2012-03-10 06:27:51 +08:00
|
|
|
const char *UcnBegin = ThisTokBuf;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-03-31 07:46:03 +08:00
|
|
|
// Skip the '\u' char's.
|
|
|
|
ThisTokBuf += 2;
|
|
|
|
|
2013-02-09 06:30:41 +08:00
|
|
|
if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
|
2010-11-17 14:46:14 +08:00
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
2013-01-25 04:50:13 +08:00
|
|
|
diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
|
2010-10-09 08:27:47 +08:00
|
|
|
return false;
|
2009-03-31 07:46:03 +08:00
|
|
|
}
|
2010-10-09 08:27:47 +08:00
|
|
|
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
|
2010-09-01 07:34:27 +08:00
|
|
|
unsigned short UcnLenSave = UcnLen;
|
2010-10-09 08:27:47 +08:00
|
|
|
for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
|
2013-01-19 06:33:58 +08:00
|
|
|
int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
|
2009-03-31 07:46:03 +08:00
|
|
|
if (CharVal == -1) break;
|
|
|
|
UcnVal <<= 4;
|
|
|
|
UcnVal |= CharVal;
|
|
|
|
}
|
|
|
|
// If we didn't consume the proper number of digits, there is a problem.
|
2010-10-09 08:27:47 +08:00
|
|
|
if (UcnLenSave) {
|
2012-09-08 15:16:20 +08:00
|
|
|
if (Diags)
|
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
|
|
|
diag::err_ucn_escape_incomplete);
|
2010-10-09 08:27:47 +08:00
|
|
|
return false;
|
2009-03-31 07:46:03 +08:00
|
|
|
}
|
2012-03-10 06:27:51 +08:00
|
|
|
|
2012-01-18 20:27:04 +08:00
|
|
|
// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
|
2012-03-10 06:27:51 +08:00
|
|
|
if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
|
|
|
|
UcnVal > 0x10FFFF) { // maximum legal UTF32 value
|
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
|
|
|
diag::err_ucn_escape_invalid);
|
2012-03-10 06:27:51 +08:00
|
|
|
return false;
|
|
|
|
}
|
2012-01-18 20:27:04 +08:00
|
|
|
|
|
|
|
// C++11 allows UCNs that refer to control characters and basic source
|
|
|
|
// characters inside character and string literals
|
2012-03-10 06:27:51 +08:00
|
|
|
if (UcnVal < 0xa0 &&
|
|
|
|
(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
|
2013-01-02 19:42:31 +08:00
|
|
|
bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
|
2012-03-10 06:27:51 +08:00
|
|
|
if (Diags) {
|
|
|
|
char BasicSCSChar = UcnVal;
|
|
|
|
if (UcnVal >= 0x20 && UcnVal < 0x7f)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
|
|
|
IsError ? diag::err_ucn_escape_basic_scs :
|
|
|
|
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
|
|
|
|
<< StringRef(&BasicSCSChar, 1);
|
2012-03-10 06:27:51 +08:00
|
|
|
else
|
2012-09-08 15:16:20 +08:00
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
|
|
|
IsError ? diag::err_ucn_control_character :
|
|
|
|
diag::warn_cxx98_compat_literal_ucn_control_character);
|
2012-01-18 20:27:04 +08:00
|
|
|
}
|
2012-03-10 06:27:51 +08:00
|
|
|
if (IsError)
|
|
|
|
return false;
|
2012-01-18 20:27:04 +08:00
|
|
|
}
|
|
|
|
|
2012-09-08 15:16:20 +08:00
|
|
|
if (!Features.CPlusPlus && !Features.C99 && Diags)
|
|
|
|
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
2013-01-28 04:12:04 +08:00
|
|
|
diag::warn_ucn_not_valid_in_c89_literal);
|
2012-09-08 15:16:20 +08:00
|
|
|
|
2010-10-09 08:27:47 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2012-06-13 13:37:23 +08:00
|
|
|
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
|
|
|
|
/// which this UCN will occupy.
|
|
|
|
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
|
|
|
const char *ThisTokEnd, unsigned CharByteWidth,
|
|
|
|
const LangOptions &Features, bool &HadError) {
|
|
|
|
// UTF-32: 4 bytes per escape.
|
|
|
|
if (CharByteWidth == 4)
|
|
|
|
return 4;
|
|
|
|
|
|
|
|
uint32_t UcnVal = 0;
|
|
|
|
unsigned short UcnLen = 0;
|
|
|
|
FullSourceLoc Loc;
|
|
|
|
|
|
|
|
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
|
2014-05-18 07:10:59 +08:00
|
|
|
UcnLen, Loc, nullptr, Features, true)) {
|
2012-06-13 13:37:23 +08:00
|
|
|
HadError = true;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
|
|
|
|
if (CharByteWidth == 2)
|
|
|
|
return UcnVal <= 0xFFFF ? 2 : 4;
|
|
|
|
|
|
|
|
// UTF-8.
|
|
|
|
if (UcnVal < 0x80)
|
|
|
|
return 1;
|
|
|
|
if (UcnVal < 0x800)
|
|
|
|
return 2;
|
|
|
|
if (UcnVal < 0x10000)
|
|
|
|
return 3;
|
|
|
|
return 4;
|
|
|
|
}
|
|
|
|
|
2010-10-09 08:27:47 +08:00
|
|
|
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
|
|
|
|
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
|
|
|
|
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
|
|
|
|
/// we will likely rework our support for UCN's.
|
2012-03-10 06:27:51 +08:00
|
|
|
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
|
|
|
const char *ThisTokEnd,
|
2010-11-17 15:12:42 +08:00
|
|
|
char *&ResultBuf, bool &HadError,
|
2011-07-27 13:40:30 +08:00
|
|
|
FullSourceLoc Loc, unsigned CharByteWidth,
|
2011-09-26 07:23:43 +08:00
|
|
|
DiagnosticsEngine *Diags,
|
|
|
|
const LangOptions &Features) {
|
2010-10-09 08:27:47 +08:00
|
|
|
typedef uint32_t UTF32;
|
|
|
|
UTF32 UcnVal = 0;
|
|
|
|
unsigned short UcnLen = 0;
|
2012-03-10 06:27:51 +08:00
|
|
|
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
|
|
|
|
Loc, Diags, Features, true)) {
|
2012-06-13 13:37:23 +08:00
|
|
|
HadError = true;
|
2009-03-31 07:46:03 +08:00
|
|
|
return;
|
|
|
|
}
|
2010-10-09 08:27:47 +08:00
|
|
|
|
2013-09-19 07:23:13 +08:00
|
|
|
assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
|
2011-07-27 13:40:30 +08:00
|
|
|
"only character widths of 1, 2, or 4 bytes supported");
|
2010-10-06 12:57:26 +08:00
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
(void)UcnLen;
|
|
|
|
assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
|
|
|
|
|
|
|
|
if (CharByteWidth == 4) {
|
2011-11-03 07:06:23 +08:00
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
|
|
|
|
*ResultPtr = UcnVal;
|
|
|
|
ResultBuf += 4;
|
2011-07-27 13:40:30 +08:00
|
|
|
return;
|
|
|
|
}
|
2010-10-06 12:57:26 +08:00
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
if (CharByteWidth == 2) {
|
2011-11-03 07:06:23 +08:00
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
|
|
|
|
|
2012-06-13 13:41:29 +08:00
|
|
|
if (UcnVal <= (UTF32)0xFFFF) {
|
2011-11-03 07:06:23 +08:00
|
|
|
*ResultPtr = UcnVal;
|
|
|
|
ResultBuf += 2;
|
2010-10-06 12:57:26 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-11-03 07:06:23 +08:00
|
|
|
// Convert to UTF16.
|
2010-10-06 12:57:26 +08:00
|
|
|
UcnVal -= 0x10000;
|
2011-11-03 07:06:23 +08:00
|
|
|
*ResultPtr = 0xD800 + (UcnVal >> 10);
|
|
|
|
*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
|
|
|
|
ResultBuf += 4;
|
2010-09-01 07:34:27 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-07-27 13:40:30 +08:00
|
|
|
|
|
|
|
assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
|
|
|
|
|
2009-03-31 07:46:03 +08:00
|
|
|
// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
|
|
|
|
// The conversion below was inspired by:
|
|
|
|
// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
|
2009-09-09 23:08:12 +08:00
|
|
|
// First, we determine how many bytes the result will require.
|
2009-04-01 19:09:15 +08:00
|
|
|
typedef uint8_t UTF8;
|
2009-03-31 07:46:03 +08:00
|
|
|
|
|
|
|
unsigned short bytesToWrite = 0;
|
|
|
|
if (UcnVal < (UTF32)0x80)
|
|
|
|
bytesToWrite = 1;
|
|
|
|
else if (UcnVal < (UTF32)0x800)
|
|
|
|
bytesToWrite = 2;
|
|
|
|
else if (UcnVal < (UTF32)0x10000)
|
|
|
|
bytesToWrite = 3;
|
|
|
|
else
|
|
|
|
bytesToWrite = 4;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-03-31 07:46:03 +08:00
|
|
|
const unsigned byteMask = 0xBF;
|
|
|
|
const unsigned byteMark = 0x80;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-03-31 07:46:03 +08:00
|
|
|
// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
|
2009-03-31 18:29:45 +08:00
|
|
|
// into the first byte, depending on how many bytes follow.
|
2009-09-09 23:08:12 +08:00
|
|
|
static const UTF8 firstByteMark[5] = {
|
2009-03-31 18:29:45 +08:00
|
|
|
0x00, 0x00, 0xC0, 0xE0, 0xF0
|
2009-03-31 07:46:03 +08:00
|
|
|
};
|
|
|
|
// Finally, we write the bytes into ResultBuf.
|
|
|
|
ResultBuf += bytesToWrite;
|
|
|
|
switch (bytesToWrite) { // note: everything falls through.
|
2012-11-09 03:22:26 +08:00
|
|
|
case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
|
|
|
|
case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
|
|
|
|
case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
|
|
|
|
case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
|
2009-03-31 07:46:03 +08:00
|
|
|
}
|
|
|
|
// Update the buffer.
|
|
|
|
ResultBuf += bytesToWrite;
|
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
|
|
|
|
|
2007-03-10 07:16:33 +08:00
|
|
|
/// integer-constant: [C99 6.4.4.1]
|
|
|
|
/// decimal-constant integer-suffix
|
|
|
|
/// octal-constant integer-suffix
|
|
|
|
/// hexadecimal-constant integer-suffix
|
2013-07-23 16:14:48 +08:00
|
|
|
/// binary-literal integer-suffix [GNU, C++1y]
|
2012-03-09 05:59:28 +08:00
|
|
|
/// user-defined-integer-literal: [C++11 lex.ext]
|
2012-03-08 16:45:32 +08:00
|
|
|
/// decimal-literal ud-suffix
|
|
|
|
/// octal-literal ud-suffix
|
|
|
|
/// hexadecimal-literal ud-suffix
|
2013-07-23 16:14:48 +08:00
|
|
|
/// binary-literal ud-suffix [GNU, C++1y]
|
2009-09-09 23:08:12 +08:00
|
|
|
/// decimal-constant:
|
2007-03-10 07:16:33 +08:00
|
|
|
/// nonzero-digit
|
|
|
|
/// decimal-constant digit
|
2009-09-09 23:08:12 +08:00
|
|
|
/// octal-constant:
|
2007-03-10 07:16:33 +08:00
|
|
|
/// 0
|
|
|
|
/// octal-constant octal-digit
|
2009-09-09 23:08:12 +08:00
|
|
|
/// hexadecimal-constant:
|
2007-03-10 07:16:33 +08:00
|
|
|
/// hexadecimal-prefix hexadecimal-digit
|
|
|
|
/// hexadecimal-constant hexadecimal-digit
|
|
|
|
/// hexadecimal-prefix: one of
|
|
|
|
/// 0x 0X
|
2013-07-23 16:14:48 +08:00
|
|
|
/// binary-literal:
|
|
|
|
/// 0b binary-digit
|
|
|
|
/// 0B binary-digit
|
|
|
|
/// binary-literal binary-digit
|
2007-03-10 07:16:33 +08:00
|
|
|
/// integer-suffix:
|
|
|
|
/// unsigned-suffix [long-suffix]
|
|
|
|
/// unsigned-suffix [long-long-suffix]
|
|
|
|
/// long-suffix [unsigned-suffix]
|
|
|
|
/// long-long-suffix [unsigned-suffix]
|
|
|
|
/// nonzero-digit:
|
|
|
|
/// 1 2 3 4 5 6 7 8 9
|
|
|
|
/// octal-digit:
|
|
|
|
/// 0 1 2 3 4 5 6 7
|
|
|
|
/// hexadecimal-digit:
|
|
|
|
/// 0 1 2 3 4 5 6 7 8 9
|
|
|
|
/// a b c d e f
|
|
|
|
/// A B C D E F
|
2013-07-23 16:14:48 +08:00
|
|
|
/// binary-digit:
|
|
|
|
/// 0
|
|
|
|
/// 1
|
2007-03-10 07:16:33 +08:00
|
|
|
/// unsigned-suffix: one of
|
|
|
|
/// u U
|
|
|
|
/// long-suffix: one of
|
|
|
|
/// l L
|
2009-09-09 23:08:12 +08:00
|
|
|
/// long-long-suffix: one of
|
2007-03-10 07:16:33 +08:00
|
|
|
/// ll LL
|
|
|
|
///
|
|
|
|
/// floating-constant: [C99 6.4.4.2]
|
|
|
|
/// TODO: add rules...
|
|
|
|
///
|
2012-09-24 17:53:54 +08:00
|
|
|
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
|
|
|
|
SourceLocation TokLoc,
|
|
|
|
Preprocessor &PP)
|
|
|
|
: PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2008-10-01 04:45:40 +08:00
|
|
|
// This routine assumes that the range begin/end matches the regex for integer
|
|
|
|
// and FP constants (specifically, the 'pp-number' regex), and assumes that
|
|
|
|
// the byte at "*end" is both valid and not part of the regex. Because of
|
|
|
|
// this, it doesn't have to check for 'overscan' in various places.
|
2013-02-09 06:30:41 +08:00
|
|
|
assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2012-09-24 17:53:54 +08:00
|
|
|
s = DigitsBegin = ThisTokBegin;
|
2007-03-10 07:16:33 +08:00
|
|
|
saw_exponent = false;
|
|
|
|
saw_period = false;
|
2012-03-08 16:45:32 +08:00
|
|
|
saw_ud_suffix = false;
|
2007-03-10 07:16:33 +08:00
|
|
|
isLong = false;
|
|
|
|
isUnsigned = false;
|
|
|
|
isLongLong = false;
|
2007-08-26 11:29:23 +08:00
|
|
|
isFloat = false;
|
2007-08-26 09:58:14 +08:00
|
|
|
isImaginary = false;
|
2014-06-22 02:46:07 +08:00
|
|
|
MicrosoftInteger = 0;
|
2007-03-10 07:16:33 +08:00
|
|
|
hadError = false;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-10 07:16:33 +08:00
|
|
|
if (*s == '0') { // parse radix
|
2008-06-30 14:39:54 +08:00
|
|
|
ParseNumberStartingWithZero(TokLoc);
|
|
|
|
if (hadError)
|
|
|
|
return;
|
2007-03-10 07:16:33 +08:00
|
|
|
} else { // the first digit is non-zero
|
|
|
|
radix = 10;
|
|
|
|
s = SkipDigits(s);
|
|
|
|
if (s == ThisTokEnd) {
|
2007-06-09 01:12:06 +08:00
|
|
|
// Done.
|
2016-01-28 13:22:54 +08:00
|
|
|
} else {
|
|
|
|
ParseDecimalOrOctalCommon(TokLoc);
|
|
|
|
if (hadError)
|
2008-04-21 02:41:46 +08:00
|
|
|
return;
|
2007-03-10 07:16:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SuffixBegin = s;
|
2013-09-26 12:19:11 +08:00
|
|
|
checkSeparator(TokLoc, s, CSK_AfterDigits);
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-08-26 09:58:14 +08:00
|
|
|
// Parse the suffix. At this point we can classify whether we have an FP or
|
|
|
|
// integer constant.
|
|
|
|
bool isFPConstant = isFloatingLiteral();
|
2014-05-18 07:10:59 +08:00
|
|
|
const char *ImaginarySuffixLoc = nullptr;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-08-26 09:58:14 +08:00
|
|
|
// Loop over all of the characters of the suffix. If we see something bad,
|
|
|
|
// we break out of the loop.
|
|
|
|
for (; s != ThisTokEnd; ++s) {
|
|
|
|
switch (*s) {
|
|
|
|
case 'f': // FP Suffix for "float"
|
|
|
|
case 'F':
|
|
|
|
if (!isFPConstant) break; // Error for integer constant.
|
2007-08-26 11:29:23 +08:00
|
|
|
if (isFloat || isLong) break; // FF, LF invalid.
|
|
|
|
isFloat = true;
|
2007-08-26 09:58:14 +08:00
|
|
|
continue; // Success.
|
|
|
|
case 'u':
|
|
|
|
case 'U':
|
|
|
|
if (isFPConstant) break; // Error for floating constant.
|
|
|
|
if (isUnsigned) break; // Cannot be repeated.
|
|
|
|
isUnsigned = true;
|
|
|
|
continue; // Success.
|
|
|
|
case 'l':
|
|
|
|
case 'L':
|
|
|
|
if (isLong || isLongLong) break; // Cannot be repeated.
|
2007-08-26 11:29:23 +08:00
|
|
|
if (isFloat) break; // LF invalid.
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-08-26 09:58:14 +08:00
|
|
|
// Check for long long. The L's need to be adjacent and the same case.
|
2015-03-29 22:11:22 +08:00
|
|
|
if (s[1] == s[0]) {
|
|
|
|
assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
|
2007-08-26 09:58:14 +08:00
|
|
|
if (isFPConstant) break; // long long invalid for floats.
|
|
|
|
isLongLong = true;
|
|
|
|
++s; // Eat both of them.
|
|
|
|
} else {
|
2007-03-10 07:16:33 +08:00
|
|
|
isLong = true;
|
|
|
|
}
|
2007-08-26 09:58:14 +08:00
|
|
|
continue; // Success.
|
|
|
|
case 'i':
|
2010-10-14 08:24:10 +08:00
|
|
|
case 'I':
|
2012-03-11 15:00:24 +08:00
|
|
|
if (PP.getLangOpts().MicrosoftExt) {
|
2014-06-22 02:46:07 +08:00
|
|
|
if (isLong || isLongLong || MicrosoftInteger)
|
|
|
|
break;
|
2009-11-28 21:37:52 +08:00
|
|
|
|
2015-03-29 22:11:22 +08:00
|
|
|
if (!isFPConstant) {
|
2015-07-26 17:02:26 +08:00
|
|
|
// Allow i8, i16, i32, and i64.
|
2009-10-09 06:55:36 +08:00
|
|
|
switch (s[1]) {
|
2015-03-29 22:11:22 +08:00
|
|
|
case '8':
|
|
|
|
s += 2; // i8 suffix
|
|
|
|
MicrosoftInteger = 8;
|
|
|
|
break;
|
|
|
|
case '1':
|
|
|
|
if (s[2] == '6') {
|
|
|
|
s += 3; // i16 suffix
|
|
|
|
MicrosoftInteger = 16;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case '3':
|
|
|
|
if (s[2] == '2') {
|
|
|
|
s += 3; // i32 suffix
|
|
|
|
MicrosoftInteger = 32;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case '6':
|
|
|
|
if (s[2] == '4') {
|
|
|
|
s += 3; // i64 suffix
|
|
|
|
MicrosoftInteger = 64;
|
|
|
|
}
|
2014-05-30 07:10:15 +08:00
|
|
|
break;
|
2015-03-29 22:11:22 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (MicrosoftInteger) {
|
|
|
|
assert(s <= ThisTokEnd && "didn't maximally munch?");
|
|
|
|
break;
|
2008-04-05 05:02:54 +08:00
|
|
|
}
|
|
|
|
}
|
2013-09-24 12:06:10 +08:00
|
|
|
// "i", "if", and "il" are user-defined suffixes in C++1y.
|
2015-03-29 22:11:22 +08:00
|
|
|
if (*s == 'i' && PP.getLangOpts().CPlusPlus14)
|
2013-09-24 12:06:10 +08:00
|
|
|
break;
|
2008-04-05 05:02:54 +08:00
|
|
|
// fall through.
|
2007-08-26 09:58:14 +08:00
|
|
|
case 'j':
|
|
|
|
case 'J':
|
|
|
|
if (isImaginary) break; // Cannot be repeated.
|
|
|
|
isImaginary = true;
|
2013-07-23 16:14:48 +08:00
|
|
|
ImaginarySuffixLoc = s;
|
2007-08-26 09:58:14 +08:00
|
|
|
continue; // Success.
|
2007-03-10 07:16:33 +08:00
|
|
|
}
|
2012-03-08 16:45:32 +08:00
|
|
|
// If we reached here, there was an error or a ud-suffix.
|
2007-08-26 09:58:14 +08:00
|
|
|
break;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-08-26 09:58:14 +08:00
|
|
|
if (s != ThisTokEnd) {
|
2014-02-18 05:52:30 +08:00
|
|
|
// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
|
|
|
|
expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
|
|
|
|
if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
|
2013-07-23 16:14:48 +08:00
|
|
|
// Any suffix pieces we might have parsed are actually part of the
|
|
|
|
// ud-suffix.
|
|
|
|
isLong = false;
|
|
|
|
isUnsigned = false;
|
|
|
|
isLongLong = false;
|
|
|
|
isFloat = false;
|
|
|
|
isImaginary = false;
|
2014-06-22 02:46:07 +08:00
|
|
|
MicrosoftInteger = 0;
|
2013-07-23 16:14:48 +08:00
|
|
|
|
2012-03-08 16:45:32 +08:00
|
|
|
saw_ud_suffix = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Report an error if there are any.
|
2012-09-24 17:53:54 +08:00
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
|
2015-11-12 15:36:50 +08:00
|
|
|
diag::err_invalid_suffix_constant)
|
|
|
|
<< StringRef(SuffixBegin, ThisTokEnd-SuffixBegin) << isFPConstant;
|
2008-11-22 15:23:31 +08:00
|
|
|
hadError = true;
|
2007-08-26 09:58:14 +08:00
|
|
|
return;
|
2007-03-10 07:16:33 +08:00
|
|
|
}
|
2013-07-23 16:14:48 +08:00
|
|
|
|
|
|
|
if (isImaginary) {
|
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
|
|
|
|
ImaginarySuffixLoc - ThisTokBegin),
|
|
|
|
diag::ext_imaginary_constant);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-01-28 13:22:54 +08:00
|
|
|
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
|
|
|
|
/// numbers. It issues an error for illegal digits, and handles floating point
|
|
|
|
/// parsing. If it detects a floating point number, the radix is set to 10.
|
|
|
|
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
|
|
|
|
assert((radix == 8 || radix == 10) && "Unexpected radix");
|
|
|
|
|
|
|
|
// If we have a hex digit other than 'e' (which denotes a FP exponent) then
|
|
|
|
// the code is using an incorrect base.
|
|
|
|
if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
|
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
|
|
|
|
diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
|
|
|
|
hadError = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*s == '.') {
|
|
|
|
checkSeparator(TokLoc, s, CSK_AfterDigits);
|
|
|
|
s++;
|
|
|
|
radix = 10;
|
|
|
|
saw_period = true;
|
|
|
|
checkSeparator(TokLoc, s, CSK_BeforeDigits);
|
|
|
|
s = SkipDigits(s); // Skip suffix.
|
|
|
|
}
|
|
|
|
if (*s == 'e' || *s == 'E') { // exponent
|
|
|
|
checkSeparator(TokLoc, s, CSK_AfterDigits);
|
|
|
|
const char *Exponent = s;
|
|
|
|
s++;
|
|
|
|
radix = 10;
|
|
|
|
saw_exponent = true;
|
|
|
|
if (*s == '+' || *s == '-') s++; // sign
|
|
|
|
const char *first_non_digit = SkipDigits(s);
|
|
|
|
if (first_non_digit != s) {
|
|
|
|
checkSeparator(TokLoc, s, CSK_BeforeDigits);
|
|
|
|
s = first_non_digit;
|
|
|
|
} else {
|
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
|
|
|
|
diag::err_exponent_has_no_digits);
|
|
|
|
hadError = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-23 16:14:48 +08:00
|
|
|
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
|
|
|
|
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
|
|
|
|
/// treat it as an invalid suffix.
|
|
|
|
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
|
|
|
|
StringRef Suffix) {
|
|
|
|
if (!LangOpts.CPlusPlus11 || Suffix.empty())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
|
|
|
|
if (Suffix[0] == '_')
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// In C++11, there are no library suffixes.
|
2014-08-19 23:55:55 +08:00
|
|
|
if (!LangOpts.CPlusPlus14)
|
2013-07-23 16:14:48 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
// In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
|
2013-09-24 12:06:10 +08:00
|
|
|
// Per tweaked N3660, "il", "i", and "if" are also used in the library.
|
2013-07-23 16:14:48 +08:00
|
|
|
return llvm::StringSwitch<bool>(Suffix)
|
|
|
|
.Cases("h", "min", "s", true)
|
|
|
|
.Cases("ms", "us", "ns", true)
|
2013-09-24 12:06:10 +08:00
|
|
|
.Cases("il", "i", "if", true)
|
2013-07-23 16:14:48 +08:00
|
|
|
.Default(false);
|
2007-03-10 07:16:33 +08:00
|
|
|
}
|
|
|
|
|
2013-09-26 11:33:06 +08:00
|
|
|
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
|
2013-09-26 12:19:11 +08:00
|
|
|
const char *Pos,
|
|
|
|
CheckSeparatorKind IsAfterDigits) {
|
|
|
|
if (IsAfterDigits == CSK_AfterDigits) {
|
2013-09-26 13:57:03 +08:00
|
|
|
if (Pos == ThisTokBegin)
|
|
|
|
return;
|
2013-09-26 11:33:06 +08:00
|
|
|
--Pos;
|
2013-09-26 13:57:03 +08:00
|
|
|
} else if (Pos == ThisTokEnd)
|
|
|
|
return;
|
2013-09-26 11:33:06 +08:00
|
|
|
|
|
|
|
if (isDigitSeparator(*Pos))
|
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
|
|
|
|
diag::err_digit_separator_not_between_digits)
|
|
|
|
<< IsAfterDigits;
|
|
|
|
}
|
|
|
|
|
2008-06-30 14:39:54 +08:00
|
|
|
/// ParseNumberStartingWithZero - This method is called when the first character
|
|
|
|
/// of the number is found to be a zero. This means it is either an octal
|
|
|
|
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
|
2009-09-09 23:08:12 +08:00
|
|
|
/// a floating point number (01239.123e4). Eat the prefix, determining the
|
2008-06-30 14:39:54 +08:00
|
|
|
/// radix etc.
|
|
|
|
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
|
|
|
|
assert(s[0] == '0' && "Invalid method call");
|
|
|
|
s++;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2013-09-27 12:42:28 +08:00
|
|
|
int c1 = s[0];
|
|
|
|
|
2008-06-30 14:39:54 +08:00
|
|
|
// Handle a hex number like 0x1234.
|
2015-03-29 22:11:37 +08:00
|
|
|
if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
|
2008-06-30 14:39:54 +08:00
|
|
|
s++;
|
2015-03-29 22:11:37 +08:00
|
|
|
assert(s < ThisTokEnd && "didn't maximally munch?");
|
2008-06-30 14:39:54 +08:00
|
|
|
radix = 16;
|
|
|
|
DigitsBegin = s;
|
|
|
|
s = SkipHexDigits(s);
|
2012-02-08 21:36:33 +08:00
|
|
|
bool noSignificand = (s == DigitsBegin);
|
2008-06-30 14:39:54 +08:00
|
|
|
if (s == ThisTokEnd) {
|
|
|
|
// Done.
|
|
|
|
} else if (*s == '.') {
|
|
|
|
s++;
|
|
|
|
saw_period = true;
|
2012-02-08 21:36:33 +08:00
|
|
|
const char *floatDigitsBegin = s;
|
2014-04-23 07:50:25 +08:00
|
|
|
checkSeparator(TokLoc, s, CSK_BeforeDigits);
|
2008-06-30 14:39:54 +08:00
|
|
|
s = SkipHexDigits(s);
|
2012-02-08 21:36:33 +08:00
|
|
|
noSignificand &= (floatDigitsBegin == s);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (noSignificand) {
|
2012-09-24 17:53:54 +08:00
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
|
2015-11-14 10:09:55 +08:00
|
|
|
diag::err_hexconstant_requires) << 1;
|
2012-02-08 21:36:33 +08:00
|
|
|
hadError = true;
|
|
|
|
return;
|
2008-06-30 14:39:54 +08:00
|
|
|
}
|
2012-02-08 21:36:33 +08:00
|
|
|
|
2008-06-30 14:39:54 +08:00
|
|
|
// A binary exponent can appear with or with a '.'. If dotted, the
|
2009-09-09 23:08:12 +08:00
|
|
|
// binary exponent is required.
|
2011-08-31 06:40:35 +08:00
|
|
|
if (*s == 'p' || *s == 'P') {
|
2014-04-23 07:50:25 +08:00
|
|
|
checkSeparator(TokLoc, s, CSK_AfterDigits);
|
2008-06-30 14:39:54 +08:00
|
|
|
const char *Exponent = s;
|
|
|
|
s++;
|
|
|
|
saw_exponent = true;
|
|
|
|
if (*s == '+' || *s == '-') s++; // sign
|
|
|
|
const char *first_non_digit = SkipDigits(s);
|
2008-07-26 02:18:34 +08:00
|
|
|
if (first_non_digit == s) {
|
2008-11-22 15:23:31 +08:00
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
|
|
|
|
diag::err_exponent_has_no_digits);
|
|
|
|
hadError = true;
|
2008-07-26 02:18:34 +08:00
|
|
|
return;
|
2008-06-30 14:39:54 +08:00
|
|
|
}
|
2014-04-23 07:50:25 +08:00
|
|
|
checkSeparator(TokLoc, s, CSK_BeforeDigits);
|
2008-07-26 02:18:34 +08:00
|
|
|
s = first_non_digit;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2012-03-11 15:00:24 +08:00
|
|
|
if (!PP.getLangOpts().HexFloats)
|
2008-11-22 15:23:31 +08:00
|
|
|
PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
|
2008-06-30 14:39:54 +08:00
|
|
|
} else if (saw_period) {
|
2008-11-22 15:23:31 +08:00
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
|
2015-11-14 10:09:55 +08:00
|
|
|
diag::err_hexconstant_requires) << 0;
|
2008-11-22 15:23:31 +08:00
|
|
|
hadError = true;
|
2008-06-30 14:39:54 +08:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2008-06-30 14:39:54 +08:00
|
|
|
// Handle simple binary numbers 0b01010
|
2015-03-29 22:11:37 +08:00
|
|
|
if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
|
2013-04-20 04:47:20 +08:00
|
|
|
// 0b101010 is a C++1y / GCC extension.
|
|
|
|
PP.Diag(TokLoc,
|
2014-08-19 23:55:55 +08:00
|
|
|
PP.getLangOpts().CPlusPlus14
|
2013-04-20 04:47:20 +08:00
|
|
|
? diag::warn_cxx11_compat_binary_literal
|
|
|
|
: PP.getLangOpts().CPlusPlus
|
2014-08-19 23:55:55 +08:00
|
|
|
? diag::ext_binary_literal_cxx14
|
2013-04-20 04:47:20 +08:00
|
|
|
: diag::ext_binary_literal);
|
2008-06-30 14:39:54 +08:00
|
|
|
++s;
|
2015-03-29 22:11:37 +08:00
|
|
|
assert(s < ThisTokEnd && "didn't maximally munch?");
|
2008-06-30 14:39:54 +08:00
|
|
|
radix = 2;
|
|
|
|
DigitsBegin = s;
|
|
|
|
s = SkipBinaryDigits(s);
|
|
|
|
if (s == ThisTokEnd) {
|
|
|
|
// Done.
|
2013-02-09 06:30:41 +08:00
|
|
|
} else if (isHexDigit(*s)) {
|
2008-11-22 15:23:31 +08:00
|
|
|
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
|
2015-11-14 10:09:55 +08:00
|
|
|
diag::err_invalid_digit) << StringRef(s, 1) << 2;
|
2008-11-22 15:23:31 +08:00
|
|
|
hadError = true;
|
2008-06-30 14:39:54 +08:00
|
|
|
}
|
2008-06-30 14:44:49 +08:00
|
|
|
// Other suffixes will be diagnosed by the caller.
|
2008-06-30 14:39:54 +08:00
|
|
|
return;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2008-06-30 14:39:54 +08:00
|
|
|
// For now, the radix is set to 8. If we discover that we have a
|
|
|
|
// floating point constant, the radix will change to 10. Octal floating
|
2009-09-09 23:08:12 +08:00
|
|
|
// point constants are not permitted (only decimal and hexadecimal).
|
2008-06-30 14:39:54 +08:00
|
|
|
radix = 8;
|
|
|
|
DigitsBegin = s;
|
|
|
|
s = SkipOctalDigits(s);
|
|
|
|
if (s == ThisTokEnd)
|
|
|
|
return; // Done, simple octal number like 01234
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2008-06-30 14:44:49 +08:00
|
|
|
// If we have some other non-octal digit that *is* a decimal digit, see if
|
|
|
|
// this is part of a floating point number like 094.123 or 09e1.
|
2013-02-09 06:30:41 +08:00
|
|
|
if (isDigit(*s)) {
|
2008-06-30 14:44:49 +08:00
|
|
|
const char *EndDecimal = SkipDigits(s);
|
|
|
|
if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
|
|
|
|
s = EndDecimal;
|
|
|
|
radix = 10;
|
|
|
|
}
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2016-01-28 13:22:54 +08:00
|
|
|
ParseDecimalOrOctalCommon(TokLoc);
|
2008-06-30 14:39:54 +08:00
|
|
|
}
|
|
|
|
|
Rename CanFitInto64Bits to alwaysFitsInto64Bits per discussion on IRC.
This makes the behavior clearer concerning literals with the maximum
number of digits. For a 32-bit example, 4,000,000,000 is a valid uint32_t,
but 5,000,000,000 is not, so we'd have to count 10-digit decimal numbers
as "unsafe" (meaning we have to check for overflow when parsing them,
just as we would for numbers with 11 digits or higher). This is the same,
only with 64 bits to play with.
No functionality change.
llvm-svn: 164639
2012-09-26 06:32:51 +08:00
|
|
|
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
|
2012-09-26 03:09:15 +08:00
|
|
|
switch (Radix) {
|
|
|
|
case 2:
|
|
|
|
return NumDigits <= 64;
|
|
|
|
case 8:
|
|
|
|
return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
|
|
|
|
case 10:
|
|
|
|
return NumDigits <= 19; // floor(log10(2^64))
|
|
|
|
case 16:
|
|
|
|
return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
|
|
|
|
default:
|
|
|
|
llvm_unreachable("impossible Radix");
|
|
|
|
}
|
|
|
|
}
|
2008-06-30 14:39:54 +08:00
|
|
|
|
2007-04-04 13:52:58 +08:00
|
|
|
/// GetIntegerValue - Convert this numeric literal value to an APInt that
|
2007-04-04 14:36:34 +08:00
|
|
|
/// matches Val's input width. If there is an overflow, set Val to the low bits
|
|
|
|
/// of the result and return true. Otherwise, return false.
|
2007-06-16 07:05:46 +08:00
|
|
|
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
|
2008-10-16 15:32:01 +08:00
|
|
|
// Fast path: Compute a conservative bound on the maximum number of
|
|
|
|
// bits per digit in this radix. If we can't possibly overflow a
|
|
|
|
// uint64 based on that bound then do the simple conversion to
|
|
|
|
// integer. This avoids the expensive overflow checking below, and
|
|
|
|
// handles the common cases that matter (small decimal integers and
|
|
|
|
// hex/octal values which don't overflow).
|
2012-09-26 03:09:15 +08:00
|
|
|
const unsigned NumDigits = SuffixBegin - DigitsBegin;
|
Rename CanFitInto64Bits to alwaysFitsInto64Bits per discussion on IRC.
This makes the behavior clearer concerning literals with the maximum
number of digits. For a 32-bit example, 4,000,000,000 is a valid uint32_t,
but 5,000,000,000 is not, so we'd have to count 10-digit decimal numbers
as "unsafe" (meaning we have to check for overflow when parsing them,
just as we would for numbers with 11 digits or higher). This is the same,
only with 64 bits to play with.
No functionality change.
llvm-svn: 164639
2012-09-26 06:32:51 +08:00
|
|
|
if (alwaysFitsInto64Bits(radix, NumDigits)) {
|
2008-10-16 15:32:01 +08:00
|
|
|
uint64_t N = 0;
|
2012-09-26 03:09:15 +08:00
|
|
|
for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
|
2013-09-26 11:33:06 +08:00
|
|
|
if (!isDigitSeparator(*Ptr))
|
|
|
|
N = N * radix + llvm::hexDigitValue(*Ptr);
|
2008-10-16 15:32:01 +08:00
|
|
|
|
|
|
|
// This will truncate the value to Val's input width. Simply check
|
|
|
|
// for overflow by comparing.
|
|
|
|
Val = N;
|
|
|
|
return Val.getZExtValue() != N;
|
|
|
|
}
|
|
|
|
|
2007-04-04 13:52:58 +08:00
|
|
|
Val = 0;
|
2012-09-26 03:09:15 +08:00
|
|
|
const char *Ptr = DigitsBegin;
|
2007-04-04 13:52:58 +08:00
|
|
|
|
2007-06-16 07:05:46 +08:00
|
|
|
llvm::APInt RadixVal(Val.getBitWidth(), radix);
|
|
|
|
llvm::APInt CharVal(Val.getBitWidth(), 0);
|
|
|
|
llvm::APInt OldVal = Val;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-04 14:36:34 +08:00
|
|
|
bool OverflowOccurred = false;
|
2012-09-26 03:09:15 +08:00
|
|
|
while (Ptr < SuffixBegin) {
|
2013-09-26 11:33:06 +08:00
|
|
|
if (isDigitSeparator(*Ptr)) {
|
|
|
|
++Ptr;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-01-19 06:33:58 +08:00
|
|
|
unsigned C = llvm::hexDigitValue(*Ptr++);
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-04 13:52:58 +08:00
|
|
|
// If this letter is out of bound for this radix, reject it.
|
2007-04-04 14:49:26 +08:00
|
|
|
assert(C < radix && "NumericLiteralParser ctor should have rejected this");
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-04 13:52:58 +08:00
|
|
|
CharVal = C;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-04 14:36:34 +08:00
|
|
|
// Add the digit to the value in the appropriate radix. If adding in digits
|
|
|
|
// made the value smaller, then this overflowed.
|
2007-04-04 13:52:58 +08:00
|
|
|
OldVal = Val;
|
2007-04-04 14:36:34 +08:00
|
|
|
|
|
|
|
// Multiply by radix, did overflow occur on the multiply?
|
2007-04-04 13:52:58 +08:00
|
|
|
Val *= RadixVal;
|
2007-04-04 14:36:34 +08:00
|
|
|
OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
|
|
|
|
|
|
|
|
// Add value, did overflow occur on the value?
|
2008-10-16 14:39:30 +08:00
|
|
|
// (a + b) ult b <=> overflow
|
2007-04-04 13:52:58 +08:00
|
|
|
Val += CharVal;
|
2007-04-04 14:36:34 +08:00
|
|
|
OverflowOccurred |= Val.ult(CharVal);
|
2007-04-04 13:52:58 +08:00
|
|
|
}
|
2007-04-04 14:36:34 +08:00
|
|
|
return OverflowOccurred;
|
2007-04-04 13:52:58 +08:00
|
|
|
}
|
|
|
|
|
2009-12-24 17:08:04 +08:00
|
|
|
/// Convert the spelling of this literal, up to the start of its suffix, into
/// \p Result using round-to-nearest-ties-to-even.  Digit separators are
/// removed from the text first.  Returns the status reported by the
/// conversion.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  using llvm::APFloat;

  // Never read beyond the end of the token, wherever the suffix starts.
  unsigned Length =
      std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  StringRef Text(ThisTokBegin, Length);

  // Only build a filtered copy when a separator is actually present.
  llvm::SmallString<16> Stripped;
  if (Text.find('\'') != StringRef::npos) {
    Stripped.reserve(Length);
    for (const char *Cur = Text.begin(), *End = Text.end(); Cur != End; ++Cur)
      if (!isDigitSeparator(*Cur))
        Stripped.push_back(*Cur);
    Text = Stripped;
  }

  return Result.convertFromString(Text, APFloat::rmNearestTiesToEven);
}
|
2007-04-04 13:52:58 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
|
2012-06-17 11:34:42 +08:00
|
|
|
/// \verbatim
|
2012-03-05 12:02:15 +08:00
|
|
|
/// user-defined-character-literal: [C++11 lex.ext]
|
|
|
|
/// character-literal ud-suffix
|
|
|
|
/// ud-suffix:
|
|
|
|
/// identifier
|
|
|
|
/// character-literal: [C++11 lex.ccon]
|
2011-08-11 12:06:15 +08:00
|
|
|
/// ' c-char-sequence '
|
|
|
|
/// u' c-char-sequence '
|
|
|
|
/// U' c-char-sequence '
|
|
|
|
/// L' c-char-sequence '
|
2016-01-08 04:59:26 +08:00
|
|
|
/// u8' c-char-sequence ' [C++1z lex.ccon]
|
2011-08-11 12:06:15 +08:00
|
|
|
/// c-char-sequence:
|
|
|
|
/// c-char
|
|
|
|
/// c-char-sequence c-char
|
|
|
|
/// c-char:
|
|
|
|
/// any member of the source character set except the single-quote ',
|
|
|
|
/// backslash \, or new-line character
|
|
|
|
/// escape-sequence
|
|
|
|
/// universal-character-name
|
2012-03-05 12:02:15 +08:00
|
|
|
/// escape-sequence:
|
2011-08-11 12:06:15 +08:00
|
|
|
/// simple-escape-sequence
|
|
|
|
/// octal-escape-sequence
|
|
|
|
/// hexadecimal-escape-sequence
|
|
|
|
/// simple-escape-sequence:
|
2011-08-12 13:49:51 +08:00
|
|
|
/// one of \' \" \? \\ \a \b \f \n \r \t \v
|
2011-08-11 12:06:15 +08:00
|
|
|
/// octal-escape-sequence:
|
|
|
|
/// \ octal-digit
|
|
|
|
/// \ octal-digit octal-digit
|
|
|
|
/// \ octal-digit octal-digit octal-digit
|
|
|
|
/// hexadecimal-escape-sequence:
|
|
|
|
/// \x hexadecimal-digit
|
|
|
|
/// hexadecimal-escape-sequence hexadecimal-digit
|
2012-03-05 12:02:15 +08:00
|
|
|
/// universal-character-name: [C++11 lex.charset]
|
2011-08-11 12:06:15 +08:00
|
|
|
/// \u hex-quad
|
|
|
|
/// \U hex-quad hex-quad
|
|
|
|
/// hex-quad:
|
|
|
|
/// hex-digit hex-digit hex-digit hex-digit
|
2012-06-17 11:34:42 +08:00
|
|
|
/// \endverbatim
|
2011-08-11 12:06:15 +08:00
|
|
|
///
|
2007-04-05 14:57:15 +08:00
|
|
|
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
|
2011-07-27 13:40:30 +08:00
|
|
|
SourceLocation Loc, Preprocessor &PP,
|
|
|
|
tok::TokenKind kind) {
|
2012-01-18 20:27:04 +08:00
|
|
|
// At this point we know that the character matches the regex "(L|u|U)?'.*'".
|
2007-04-05 14:57:15 +08:00
|
|
|
HadError = false;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
Kind = kind;
|
|
|
|
|
2012-03-10 06:27:51 +08:00
|
|
|
const char *TokBegin = begin;
|
|
|
|
|
2012-01-18 20:27:04 +08:00
|
|
|
// Skip over wide character determinant.
|
2014-11-08 14:08:42 +08:00
|
|
|
if (Kind != tok::char_constant)
|
|
|
|
++begin;
|
|
|
|
if (Kind == tok::utf8_char_constant)
|
2011-07-27 13:40:30 +08:00
|
|
|
++begin;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
// Skip over the entry quote.
|
|
|
|
assert(begin[0] == '\'' && "Invalid token lexed");
|
|
|
|
++begin;
|
|
|
|
|
2012-03-05 12:02:15 +08:00
|
|
|
// Remove an optional ud-suffix.
|
|
|
|
if (end[-1] != '\'') {
|
|
|
|
const char *UDSuffixEnd = end;
|
|
|
|
do {
|
|
|
|
--end;
|
|
|
|
} while (end[-1] != '\'');
|
2014-02-18 05:52:30 +08:00
|
|
|
// FIXME: Don't bother with this if !tok.hasUCN().
|
|
|
|
expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
|
2012-03-10 06:27:51 +08:00
|
|
|
UDSuffixOffset = end - TokBegin;
|
2012-03-05 12:02:15 +08:00
|
|
|
}
|
|
|
|
|
2012-01-18 20:27:04 +08:00
|
|
|
// Trim the ending quote.
|
2012-03-05 12:02:15 +08:00
|
|
|
assert(end != begin && "Invalid token lexed");
|
2012-01-18 20:27:04 +08:00
|
|
|
--end;
|
|
|
|
|
2009-09-09 23:08:12 +08:00
|
|
|
// FIXME: The "Value" is an uint64_t so we can handle char literals of
|
2011-04-15 13:22:18 +08:00
|
|
|
// up to 64-bits.
|
2007-04-05 14:57:15 +08:00
|
|
|
// FIXME: This extensively assumes that 'char' is 8-bits.
|
2008-03-06 02:54:05 +08:00
|
|
|
assert(PP.getTargetInfo().getCharWidth() == 8 &&
|
2007-04-05 14:57:15 +08:00
|
|
|
"Assumes char is 8 bits");
|
2009-04-29 05:51:46 +08:00
|
|
|
assert(PP.getTargetInfo().getIntWidth() <= 64 &&
|
|
|
|
(PP.getTargetInfo().getIntWidth() & 7) == 0 &&
|
|
|
|
"Assumes sizeof(int) on target is <= 64 and a multiple of char");
|
|
|
|
assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
|
|
|
|
"Assumes sizeof(wchar) on target is <= 64");
|
2009-04-21 10:21:29 +08:00
|
|
|
|
2013-08-21 10:40:19 +08:00
|
|
|
SmallVector<uint32_t, 4> codepoint_buffer;
|
|
|
|
codepoint_buffer.resize(end - begin);
|
2012-01-18 20:27:04 +08:00
|
|
|
uint32_t *buffer_begin = &codepoint_buffer.front();
|
|
|
|
uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
|
|
|
|
|
|
|
|
// Unicode escapes representing characters that cannot be correctly
|
|
|
|
// represented in a single code unit are disallowed in character literals
|
|
|
|
// by this implementation.
|
|
|
|
uint32_t largest_character_for_kind;
|
|
|
|
if (tok::wide_char_constant == Kind) {
|
2013-08-21 10:40:19 +08:00
|
|
|
largest_character_for_kind =
|
2013-08-22 02:57:51 +08:00
|
|
|
0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
|
2014-11-08 14:08:42 +08:00
|
|
|
} else if (tok::utf8_char_constant == Kind) {
|
|
|
|
largest_character_for_kind = 0x7F;
|
2012-01-18 20:27:04 +08:00
|
|
|
} else if (tok::utf16_char_constant == Kind) {
|
|
|
|
largest_character_for_kind = 0xFFFF;
|
|
|
|
} else if (tok::utf32_char_constant == Kind) {
|
|
|
|
largest_character_for_kind = 0x10FFFF;
|
|
|
|
} else {
|
|
|
|
largest_character_for_kind = 0x7Fu;
|
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
|
2013-08-21 10:40:19 +08:00
|
|
|
while (begin != end) {
|
2012-01-18 20:27:04 +08:00
|
|
|
// Is this a span of non-escape characters?
|
|
|
|
if (begin[0] != '\\') {
|
|
|
|
char const *start = begin;
|
|
|
|
do {
|
|
|
|
++begin;
|
|
|
|
} while (begin != end && *begin != '\\');
|
|
|
|
|
2012-02-11 13:08:10 +08:00
|
|
|
char const *tmp_in_start = start;
|
|
|
|
uint32_t *tmp_out_start = buffer_begin;
|
2012-01-18 20:27:04 +08:00
|
|
|
ConversionResult res =
|
2013-08-21 10:40:19 +08:00
|
|
|
ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
|
|
|
|
reinterpret_cast<UTF8 const *>(begin),
|
|
|
|
&buffer_begin, buffer_end, strictConversion);
|
|
|
|
if (res != conversionOK) {
|
|
|
|
// If we see bad encoding for unprefixed character literals, warn and
|
|
|
|
// simply copy the byte values, for compatibility with gcc and
|
2012-02-11 13:08:10 +08:00
|
|
|
// older versions of clang.
|
|
|
|
bool NoErrorOnBadEncoding = isAscii();
|
|
|
|
unsigned Msg = diag::err_bad_character_encoding;
|
|
|
|
if (NoErrorOnBadEncoding)
|
|
|
|
Msg = diag::warn_bad_character_encoding;
|
2013-08-22 02:57:51 +08:00
|
|
|
PP.Diag(Loc, Msg);
|
2012-02-11 13:08:10 +08:00
|
|
|
if (NoErrorOnBadEncoding) {
|
|
|
|
start = tmp_in_start;
|
|
|
|
buffer_begin = tmp_out_start;
|
2013-08-21 10:40:19 +08:00
|
|
|
for (; start != begin; ++start, ++buffer_begin)
|
2012-02-11 13:08:10 +08:00
|
|
|
*buffer_begin = static_cast<uint8_t>(*start);
|
|
|
|
} else {
|
|
|
|
HadError = true;
|
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
} else {
|
2013-08-21 10:40:19 +08:00
|
|
|
for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
|
2012-02-11 13:08:10 +08:00
|
|
|
if (*tmp_out_start > largest_character_for_kind) {
|
2012-01-18 20:27:04 +08:00
|
|
|
HadError = true;
|
|
|
|
PP.Diag(Loc, diag::err_character_too_large);
|
|
|
|
}
|
2010-04-17 07:44:05 +08:00
|
|
|
}
|
2007-04-05 14:57:15 +08:00
|
|
|
}
|
2012-01-18 20:27:04 +08:00
|
|
|
|
|
|
|
continue;
|
2007-04-05 14:57:15 +08:00
|
|
|
}
|
2013-08-21 10:40:19 +08:00
|
|
|
// Is this a Universal Character Name escape?
|
2012-01-18 20:27:04 +08:00
|
|
|
if (begin[1] == 'u' || begin[1] == 'U') {
|
|
|
|
unsigned short UcnLen = 0;
|
2012-03-10 06:27:51 +08:00
|
|
|
if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
|
2012-01-18 20:27:04 +08:00
|
|
|
FullSourceLoc(Loc, PP.getSourceManager()),
|
2013-08-21 10:40:19 +08:00
|
|
|
&PP.getDiagnostics(), PP.getLangOpts(), true)) {
|
2012-01-18 20:27:04 +08:00
|
|
|
HadError = true;
|
|
|
|
} else if (*buffer_begin > largest_character_for_kind) {
|
|
|
|
HadError = true;
|
2012-09-08 15:16:20 +08:00
|
|
|
PP.Diag(Loc, diag::err_character_too_large);
|
2012-01-18 20:27:04 +08:00
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2012-01-18 20:27:04 +08:00
|
|
|
++buffer_begin;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
|
|
|
|
uint64_t result =
|
2012-09-08 15:16:20 +08:00
|
|
|
ProcessCharEscape(TokBegin, begin, end, HadError,
|
2013-08-22 02:57:51 +08:00
|
|
|
FullSourceLoc(Loc,PP.getSourceManager()),
|
2012-09-08 15:16:20 +08:00
|
|
|
CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
|
2012-01-18 20:27:04 +08:00
|
|
|
*buffer_begin++ = result;
|
2009-04-29 05:51:46 +08:00
|
|
|
}
|
|
|
|
|
2013-08-21 10:40:19 +08:00
|
|
|
unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
|
2012-01-18 20:27:04 +08:00
|
|
|
|
2009-04-29 05:51:46 +08:00
|
|
|
if (NumCharsSoFar > 1) {
|
2012-01-18 20:27:04 +08:00
|
|
|
if (isWide())
|
2011-07-27 13:40:30 +08:00
|
|
|
PP.Diag(Loc, diag::warn_extraneous_char_constant);
|
2012-01-18 20:27:04 +08:00
|
|
|
else if (isAscii() && NumCharsSoFar == 4)
|
|
|
|
PP.Diag(Loc, diag::ext_four_char_character_literal);
|
|
|
|
else if (isAscii())
|
2009-04-29 05:51:46 +08:00
|
|
|
PP.Diag(Loc, diag::ext_multichar_character_literal);
|
|
|
|
else
|
2012-01-18 20:27:04 +08:00
|
|
|
PP.Diag(Loc, diag::err_multichar_utf_character_literal);
|
2009-06-01 13:25:02 +08:00
|
|
|
IsMultiChar = true;
|
2013-08-21 10:40:19 +08:00
|
|
|
} else {
|
2009-07-29 09:46:05 +08:00
|
|
|
IsMultiChar = false;
|
2013-08-21 10:40:19 +08:00
|
|
|
}
|
2009-04-21 10:21:29 +08:00
|
|
|
|
2012-01-18 20:27:04 +08:00
|
|
|
llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
|
|
|
|
|
|
|
|
// Narrow character literals act as though their value is concatenated
|
|
|
|
// in this implementation, but warn on overflow.
|
|
|
|
bool multi_char_too_long = false;
|
|
|
|
if (isAscii() && isMultiChar()) {
|
|
|
|
LitVal = 0;
|
2013-08-21 10:40:19 +08:00
|
|
|
for (size_t i = 0; i < NumCharsSoFar; ++i) {
|
2012-01-18 20:27:04 +08:00
|
|
|
// check for enough leading zeros to shift into
|
|
|
|
multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
|
|
|
|
LitVal <<= 8;
|
|
|
|
LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
|
|
|
|
}
|
|
|
|
} else if (NumCharsSoFar > 0) {
|
|
|
|
// otherwise just take the last character
|
|
|
|
LitVal = buffer_begin[-1];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!HadError && multi_char_too_long) {
|
2013-08-21 10:40:19 +08:00
|
|
|
PP.Diag(Loc, diag::warn_char_constant_too_large);
|
2012-01-18 20:27:04 +08:00
|
|
|
}
|
|
|
|
|
2009-04-21 10:21:29 +08:00
|
|
|
// Transfer the value from APInt to uint64_t
|
|
|
|
Value = LitVal.getZExtValue();
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-04-05 14:57:15 +08:00
|
|
|
// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
|
|
|
|
// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
|
|
|
|
// character constants are not sign extended in the this implementation:
|
|
|
|
// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
|
2011-07-27 13:40:30 +08:00
|
|
|
if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
|
2012-03-11 15:00:24 +08:00
|
|
|
PP.getLangOpts().CharIsSigned)
|
2007-04-05 14:57:15 +08:00
|
|
|
Value = (signed char)Value;
|
|
|
|
}
|
|
|
|
|
2012-06-20 05:04:25 +08:00
|
|
|
/// \verbatim
|
2011-08-11 12:06:15 +08:00
|
|
|
/// string-literal: [C++0x lex.string]
|
|
|
|
/// encoding-prefix " [s-char-sequence] "
|
|
|
|
/// encoding-prefix R raw-string
|
|
|
|
/// encoding-prefix:
|
|
|
|
/// u8
|
|
|
|
/// u
|
|
|
|
/// U
|
|
|
|
/// L
|
2007-03-14 06:37:02 +08:00
|
|
|
/// s-char-sequence:
|
|
|
|
/// s-char
|
|
|
|
/// s-char-sequence s-char
|
|
|
|
/// s-char:
|
2011-08-11 12:06:15 +08:00
|
|
|
/// any member of the source character set except the double-quote ",
|
|
|
|
/// backslash \, or new-line character
|
|
|
|
/// escape-sequence
|
2007-03-14 06:37:02 +08:00
|
|
|
/// universal-character-name
|
2011-08-11 12:06:15 +08:00
|
|
|
/// raw-string:
|
|
|
|
/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
|
|
|
|
/// r-char-sequence:
|
|
|
|
/// r-char
|
|
|
|
/// r-char-sequence r-char
|
|
|
|
/// r-char:
|
|
|
|
/// any member of the source character set, except a right parenthesis )
|
|
|
|
/// followed by the initial d-char-sequence (which may be empty)
|
|
|
|
/// followed by a double quote ".
|
|
|
|
/// d-char-sequence:
|
|
|
|
/// d-char
|
|
|
|
/// d-char-sequence d-char
|
|
|
|
/// d-char:
|
|
|
|
/// any member of the basic source character set except:
|
|
|
|
/// space, the left parenthesis (, the right parenthesis ),
|
|
|
|
/// the backslash \, and the control characters representing horizontal
|
|
|
|
/// tab, vertical tab, form feed, and newline.
|
|
|
|
/// escape-sequence: [C++0x lex.ccon]
|
|
|
|
/// simple-escape-sequence
|
|
|
|
/// octal-escape-sequence
|
|
|
|
/// hexadecimal-escape-sequence
|
|
|
|
/// simple-escape-sequence:
|
2011-08-12 13:49:51 +08:00
|
|
|
/// one of \' \" \? \\ \a \b \f \n \r \t \v
|
2011-08-11 12:06:15 +08:00
|
|
|
/// octal-escape-sequence:
|
|
|
|
/// \ octal-digit
|
|
|
|
/// \ octal-digit octal-digit
|
|
|
|
/// \ octal-digit octal-digit octal-digit
|
|
|
|
/// hexadecimal-escape-sequence:
|
|
|
|
/// \x hexadecimal-digit
|
|
|
|
/// hexadecimal-escape-sequence hexadecimal-digit
|
2007-03-14 06:37:02 +08:00
|
|
|
/// universal-character-name:
|
|
|
|
/// \u hex-quad
|
|
|
|
/// \U hex-quad hex-quad
|
|
|
|
/// hex-quad:
|
|
|
|
/// hex-digit hex-digit hex-digit hex-digit
|
2012-06-20 05:04:25 +08:00
|
|
|
/// \endverbatim
|
2007-04-05 14:57:15 +08:00
|
|
|
///
|
2007-03-14 06:37:02 +08:00
|
|
|
/// Build a parser for the string tokens in \p StringToks, taking the source
/// manager, language options and target from \p PP.  Diagnostics are routed
/// to PP's diagnostics engine only when \p Complain is true.  All the parsing
/// work is delegated to init().
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
                                         Preprocessor &PP, bool Complain)
    : SM(PP.getSourceManager()),
      Features(PP.getLangOpts()),
      Target(PP.getTargetInfo()),
      Diags(Complain ? &PP.getDiagnostics() : nullptr),
      MaxTokenLength(0),
      SizeBound(0),
      CharByteWidth(0),
      Kind(tok::unknown),
      ResultPtr(ResultBuf.data()),
      hadError(false),
      Pascal(false) {
  init(StringToks);
}
|
|
|
|
|
2014-06-26 12:58:39 +08:00
|
|
|
void StringLiteralParser::init(ArrayRef<Token> StringToks){
|
2011-05-18 06:09:56 +08:00
|
|
|
// The literal token may have come from an invalid source location (e.g. due
|
|
|
|
// to a PCH error), in which case the token length will be 0.
|
2014-06-26 12:58:39 +08:00
|
|
|
if (StringToks.empty() || StringToks[0].getLength() < 2)
|
2012-05-04 01:50:32 +08:00
|
|
|
return DiagnoseLexingError(SourceLocation());
|
2011-05-18 06:09:56 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Scan all of the string portions, remember the max individual token length,
|
|
|
|
// computing a bound on the concatenated string length, and see whether any
|
|
|
|
// piece is a wide-string. If any of the string portions is a wide-string
|
|
|
|
// literal, the result is a wide-string literal [C99 6.4.5p4].
|
2014-06-26 12:58:39 +08:00
|
|
|
assert(!StringToks.empty() && "expected at least one token");
|
2010-08-31 01:47:05 +08:00
|
|
|
MaxTokenLength = StringToks[0].getLength();
|
2011-05-18 06:09:56 +08:00
|
|
|
assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
|
2010-08-31 01:47:05 +08:00
|
|
|
SizeBound = StringToks[0].getLength()-2; // -2 for "".
|
2011-07-27 13:40:30 +08:00
|
|
|
Kind = StringToks[0].getKind();
|
2010-08-31 01:47:05 +08:00
|
|
|
|
|
|
|
hadError = false;
|
2007-04-05 14:57:15 +08:00
|
|
|
|
|
|
|
// Implement Translation Phase #6: concatenation of string literals
|
|
|
|
/// (C99 5.1.1.2p1). The common case is only one string fragment.
|
2014-06-26 12:58:39 +08:00
|
|
|
for (unsigned i = 1; i != StringToks.size(); ++i) {
|
2012-05-04 01:50:32 +08:00
|
|
|
if (StringToks[i].getLength() < 2)
|
|
|
|
return DiagnoseLexingError(StringToks[i].getLocation());
|
2011-05-18 06:09:56 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// The string could be shorter than this if it needs cleaning, but this is a
|
|
|
|
// reasonable bound, which is all we need.
|
2011-05-18 06:09:56 +08:00
|
|
|
assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
|
2010-08-31 01:47:05 +08:00
|
|
|
SizeBound += StringToks[i].getLength()-2; // -2 for "".
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Remember maximum string piece length.
|
2010-08-31 01:47:05 +08:00
|
|
|
if (StringToks[i].getLength() > MaxTokenLength)
|
|
|
|
MaxTokenLength = StringToks[i].getLength();
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
// Remember if we see any wide or utf-8/16/32 strings.
|
|
|
|
// Also check for illegal concatenations.
|
|
|
|
if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
|
|
|
|
if (isAscii()) {
|
|
|
|
Kind = StringToks[i].getKind();
|
|
|
|
} else {
|
|
|
|
if (Diags)
|
2012-09-08 15:16:20 +08:00
|
|
|
Diags->Report(StringToks[i].getLocation(),
|
2011-07-27 13:40:30 +08:00
|
|
|
diag::err_unsupported_string_concat);
|
|
|
|
hadError = true;
|
|
|
|
}
|
|
|
|
}
|
2007-03-14 06:37:02 +08:00
|
|
|
}
|
2009-02-27 07:01:51 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Include space for the null terminator.
|
|
|
|
++SizeBound;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// TODO: K&R warning: "traditional C rejects string constant concatenation"
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
// Get the width in bytes of char/wchar_t/char16_t/char32_t
|
|
|
|
CharByteWidth = getCharWidth(Kind, Target);
|
|
|
|
assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
|
|
|
|
CharByteWidth /= 8;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// The output buffer size needs to be large enough to hold wide characters.
|
|
|
|
// This is a worst-case assumption which basically corresponds to L"" "long".
|
2011-07-27 13:40:30 +08:00
|
|
|
SizeBound *= CharByteWidth;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Size the temporary buffer to hold the result string data.
|
|
|
|
ResultBuf.resize(SizeBound);
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Likewise, but for each string piece.
|
2012-02-05 10:13:05 +08:00
|
|
|
SmallString<512> TokenBuf;
|
2007-03-14 06:37:02 +08:00
|
|
|
TokenBuf.resize(MaxTokenLength);
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// Loop over all the strings, getting their spelling, and expanding them to
|
|
|
|
// wide strings as appropriate.
|
|
|
|
ResultPtr = &ResultBuf[0]; // Next byte to fill in.
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2007-10-15 10:50:23 +08:00
|
|
|
Pascal = false;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2012-03-05 12:02:15 +08:00
|
|
|
SourceLocation UDSuffixTokLoc;
|
|
|
|
|
2014-06-26 12:58:39 +08:00
|
|
|
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
|
2007-03-14 06:37:02 +08:00
|
|
|
const char *ThisTokBuf = &TokenBuf[0];
|
|
|
|
// Get the spelling of the token, which eliminates trigraphs, etc. We know
|
|
|
|
// that ThisTokBuf points to a buffer that is big enough for the whole token
|
|
|
|
// and 'spelled' tokens can only shrink.
|
2010-03-16 13:20:39 +08:00
|
|
|
bool StringInvalid = false;
|
2010-11-17 15:21:13 +08:00
|
|
|
unsigned ThisTokLen =
|
2010-11-17 15:26:20 +08:00
|
|
|
Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
|
|
|
|
&StringInvalid);
|
2012-05-04 01:50:32 +08:00
|
|
|
if (StringInvalid)
|
|
|
|
return DiagnoseLexingError(StringToks[i].getLocation());
|
2010-03-16 13:20:39 +08:00
|
|
|
|
2012-03-10 06:27:51 +08:00
|
|
|
const char *ThisTokBegin = ThisTokBuf;
|
2012-03-05 12:02:15 +08:00
|
|
|
const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
|
|
|
|
|
|
|
|
// Remove an optional ud-suffix.
|
|
|
|
if (ThisTokEnd[-1] != '"') {
|
|
|
|
const char *UDSuffixEnd = ThisTokEnd;
|
|
|
|
do {
|
|
|
|
--ThisTokEnd;
|
|
|
|
} while (ThisTokEnd[-1] != '"');
|
|
|
|
|
|
|
|
StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
|
|
|
|
|
|
|
|
if (UDSuffixBuf.empty()) {
|
2014-02-18 05:52:30 +08:00
|
|
|
if (StringToks[i].hasUCN())
|
|
|
|
expandUCNs(UDSuffixBuf, UDSuffix);
|
|
|
|
else
|
|
|
|
UDSuffixBuf.assign(UDSuffix);
|
2012-03-08 09:34:56 +08:00
|
|
|
UDSuffixToken = i;
|
|
|
|
UDSuffixOffset = ThisTokEnd - ThisTokBuf;
|
2012-03-05 12:02:15 +08:00
|
|
|
UDSuffixTokLoc = StringToks[i].getLocation();
|
2014-02-18 05:52:30 +08:00
|
|
|
} else {
|
|
|
|
SmallString<32> ExpandedUDSuffix;
|
|
|
|
if (StringToks[i].hasUCN()) {
|
|
|
|
expandUCNs(ExpandedUDSuffix, UDSuffix);
|
|
|
|
UDSuffix = ExpandedUDSuffix;
|
|
|
|
}
|
|
|
|
|
2012-03-05 12:02:15 +08:00
|
|
|
// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
|
|
|
|
// result of a concatenation involving at least one user-defined-string-
|
|
|
|
// literal, all the participating user-defined-string-literals shall
|
|
|
|
// have the same ud-suffix.
|
2014-03-09 13:18:27 +08:00
|
|
|
if (UDSuffixBuf != UDSuffix) {
|
2014-02-18 05:52:30 +08:00
|
|
|
if (Diags) {
|
|
|
|
SourceLocation TokLoc = StringToks[i].getLocation();
|
|
|
|
Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
|
|
|
|
<< UDSuffixBuf << UDSuffix
|
|
|
|
<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
|
|
|
|
<< SourceRange(TokLoc, TokLoc);
|
|
|
|
}
|
|
|
|
hadError = true;
|
2012-03-05 12:02:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Strip the end quote.
|
|
|
|
--ThisTokEnd;
|
|
|
|
|
2007-03-14 06:37:02 +08:00
|
|
|
// TODO: Input character set mapping support.
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-08-08 14:10:39 +08:00
|
|
|
// Skip marker for wide or unicode strings.
|
2011-07-27 13:40:30 +08:00
|
|
|
if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
|
2007-05-20 13:00:58 +08:00
|
|
|
++ThisTokBuf;
|
2011-07-27 13:40:30 +08:00
|
|
|
// Skip 8 of u8 marker for utf8 strings.
|
|
|
|
if (ThisTokBuf[0] == '8')
|
|
|
|
++ThisTokBuf;
|
2010-09-01 07:34:27 +08:00
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-08-11 12:06:15 +08:00
|
|
|
// Check for raw string
|
|
|
|
if (ThisTokBuf[0] == 'R') {
|
|
|
|
ThisTokBuf += 2; // skip R"
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-08-11 12:06:15 +08:00
|
|
|
const char *Prefix = ThisTokBuf;
|
|
|
|
while (ThisTokBuf[0] != '(')
|
2007-10-15 10:50:23 +08:00
|
|
|
++ThisTokBuf;
|
2011-08-11 12:06:15 +08:00
|
|
|
++ThisTokBuf; // skip '('
|
|
|
|
|
2012-03-09 05:59:28 +08:00
|
|
|
// Remove same number of characters from the end
|
|
|
|
ThisTokEnd -= ThisTokBuf - Prefix;
|
|
|
|
assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
|
2011-08-11 12:06:15 +08:00
|
|
|
|
2015-09-24 00:04:47 +08:00
|
|
|
// C++14 [lex.string]p4: A source-file new-line in a raw string literal
|
|
|
|
// results in a new-line in the resulting execution string-literal.
|
|
|
|
StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
|
|
|
|
while (!RemainingTokenSpan.empty()) {
|
|
|
|
// Split the string literal on \r\n boundaries.
|
|
|
|
size_t CRLFPos = RemainingTokenSpan.find("\r\n");
|
|
|
|
StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
|
|
|
|
StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
|
|
|
|
|
|
|
|
// Copy everything before the \r\n sequence into the string literal.
|
|
|
|
if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
|
|
|
|
hadError = true;
|
|
|
|
|
|
|
|
// Point into the \n inside the \r\n sequence and operate on the
|
|
|
|
// remaining portion of the literal.
|
|
|
|
RemainingTokenSpan = AfterCRLF.substr(1);
|
|
|
|
}
|
2011-08-11 12:06:15 +08:00
|
|
|
} else {
|
2012-05-03 09:01:56 +08:00
|
|
|
if (ThisTokBuf[0] != '"') {
|
|
|
|
// The file may have come from PCH and then changed after loading the
|
|
|
|
// PCH; Fail gracefully.
|
2012-05-04 01:50:32 +08:00
|
|
|
return DiagnoseLexingError(StringToks[i].getLocation());
|
2012-05-03 09:01:56 +08:00
|
|
|
}
|
2011-08-11 12:06:15 +08:00
|
|
|
++ThisTokBuf; // skip "
|
|
|
|
|
|
|
|
// Check if this is a pascal string
|
|
|
|
if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
|
|
|
|
ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-08-11 12:06:15 +08:00
|
|
|
// If the \p sequence is found in the first token, we have a pascal string
|
|
|
|
// Otherwise, if we already have a pascal string, ignore the first \p
|
|
|
|
if (i == 0) {
|
2007-03-14 06:37:02 +08:00
|
|
|
++ThisTokBuf;
|
2011-08-11 12:06:15 +08:00
|
|
|
Pascal = true;
|
|
|
|
} else if (Pascal)
|
|
|
|
ThisTokBuf += 2;
|
2007-03-14 06:37:02 +08:00
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2011-08-11 12:06:15 +08:00
|
|
|
while (ThisTokBuf != ThisTokEnd) {
|
|
|
|
// Is this a span of non-escape characters?
|
|
|
|
if (ThisTokBuf[0] != '\\') {
|
|
|
|
const char *InStart = ThisTokBuf;
|
|
|
|
do {
|
|
|
|
++ThisTokBuf;
|
|
|
|
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
|
|
|
|
|
|
|
|
// Copy the character span over.
|
2012-09-08 15:16:20 +08:00
|
|
|
if (CopyStringFragment(StringToks[i], ThisTokBegin,
|
|
|
|
StringRef(InStart, ThisTokBuf - InStart)))
|
|
|
|
hadError = true;
|
2011-08-11 12:06:15 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Is this a Universal Character Name escape?
|
|
|
|
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
|
2012-03-10 06:27:51 +08:00
|
|
|
EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
|
|
|
|
ResultPtr, hadError,
|
|
|
|
FullSourceLoc(StringToks[i].getLocation(), SM),
|
2011-08-11 12:06:15 +08:00
|
|
|
CharByteWidth, Diags, Features);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Otherwise, this is a non-UCN escape character. Process it.
|
|
|
|
unsigned ResultChar =
|
2012-09-08 15:16:20 +08:00
|
|
|
ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
|
2011-08-11 12:06:15 +08:00
|
|
|
FullSourceLoc(StringToks[i].getLocation(), SM),
|
2012-09-08 15:16:20 +08:00
|
|
|
CharByteWidth*8, Diags, Features);
|
2011-08-11 12:06:15 +08:00
|
|
|
|
2011-11-03 07:06:23 +08:00
|
|
|
if (CharByteWidth == 4) {
|
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
|
2011-11-14 13:17:37 +08:00
|
|
|
*ResultWidePtr = ResultChar;
|
2011-11-03 07:06:23 +08:00
|
|
|
ResultPtr += 4;
|
|
|
|
} else if (CharByteWidth == 2) {
|
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
|
2011-11-14 13:17:37 +08:00
|
|
|
*ResultWidePtr = ResultChar & 0xFFFF;
|
2011-11-03 07:06:23 +08:00
|
|
|
ResultPtr += 2;
|
|
|
|
} else {
|
|
|
|
assert(CharByteWidth == 1 && "Unexpected char width");
|
|
|
|
*ResultPtr++ = ResultChar & 0xFF;
|
|
|
|
}
|
2011-08-11 12:06:15 +08:00
|
|
|
}
|
2007-03-14 06:37:02 +08:00
|
|
|
}
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-01-17 02:51:42 +08:00
|
|
|
if (Pascal) {
|
2011-11-05 08:41:04 +08:00
|
|
|
if (CharByteWidth == 4) {
|
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
|
|
|
|
ResultWidePtr[0] = GetNumStringChars() - 1;
|
|
|
|
} else if (CharByteWidth == 2) {
|
|
|
|
// FIXME: Make the type of the result buffer correct instead of
|
|
|
|
// using reinterpret_cast.
|
|
|
|
UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
|
|
|
|
ResultWidePtr[0] = GetNumStringChars() - 1;
|
|
|
|
} else {
|
|
|
|
assert(CharByteWidth == 1 && "Unexpected char width");
|
|
|
|
ResultBuf[0] = GetNumStringChars() - 1;
|
|
|
|
}
|
2009-01-17 02:51:42 +08:00
|
|
|
|
|
|
|
// Verify that pascal strings aren't too large.
|
2010-11-17 15:21:13 +08:00
|
|
|
if (GetStringLength() > 256) {
|
2012-09-08 15:16:20 +08:00
|
|
|
if (Diags)
|
2014-06-26 12:58:39 +08:00
|
|
|
Diags->Report(StringToks.front().getLocation(),
|
2010-11-17 15:21:13 +08:00
|
|
|
diag::err_pascal_string_too_long)
|
2014-06-26 12:58:39 +08:00
|
|
|
<< SourceRange(StringToks.front().getLocation(),
|
|
|
|
StringToks.back().getLocation());
|
2011-07-27 13:40:30 +08:00
|
|
|
hadError = true;
|
2009-04-01 11:17:08 +08:00
|
|
|
return;
|
|
|
|
}
|
2010-11-17 15:21:13 +08:00
|
|
|
} else if (Diags) {
|
2010-07-20 22:33:20 +08:00
|
|
|
// Complain if this string literal has too many characters.
|
2010-11-17 15:12:42 +08:00
|
|
|
unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
|
2012-11-09 03:22:26 +08:00
|
|
|
|
2010-07-20 22:33:20 +08:00
|
|
|
if (GetNumStringChars() > MaxChars)
|
2014-06-26 12:58:39 +08:00
|
|
|
Diags->Report(StringToks.front().getLocation(),
|
2010-11-17 15:21:13 +08:00
|
|
|
diag::ext_string_too_long)
|
2010-07-20 22:33:20 +08:00
|
|
|
<< GetNumStringChars() << MaxChars
|
2010-11-17 15:12:42 +08:00
|
|
|
<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
|
2014-06-26 12:58:39 +08:00
|
|
|
<< SourceRange(StringToks.front().getLocation(),
|
|
|
|
StringToks.back().getLocation());
|
2009-01-17 02:51:42 +08:00
|
|
|
}
|
2007-03-14 06:37:02 +08:00
|
|
|
}
|
2009-02-19 03:21:10 +08:00
|
|
|
|
2012-11-09 03:22:26 +08:00
|
|
|
/// \brief Find where scanning should resume after the ill-formed UTF-8
/// sequence that starts at \p Err.
///
/// The byte at \p Err is always skipped, followed by any continuation bytes
/// (those of the form 10xxxxxx). The scan never goes further than the length
/// getNumBytesForUTF8 reports for the byte at \p Err, and never past \p End.
/// If \p Err is already \p End, \p End is returned.
static const char *resyncUTF8(const char *Err, const char *End) {
  if (Err == End)
    return End;

  // Clamp the sequence length claimed by the lead byte to what is available.
  const char *SeqLimit =
      Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End - Err);

  // Step over the lead byte, then over each trailing continuation byte.
  const char *Cur = Err + 1;
  while (Cur != SeqLimit && (*Cur & 0xC0) == 0x80)
    ++Cur;
  return Cur;
}
|
|
|
|
|
2012-09-08 15:16:20 +08:00
|
|
|
/// \brief This function copies from Fragment, which is a sequence of bytes
|
|
|
|
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
|
2011-08-11 12:06:15 +08:00
|
|
|
/// Performs widening for multi-byte characters.
|
2012-09-08 15:16:20 +08:00
|
|
|
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
|
|
|
|
const char *TokBegin,
|
|
|
|
StringRef Fragment) {
|
|
|
|
const UTF8 *ErrorPtrTmp;
|
|
|
|
if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
|
|
|
|
return false;
|
2011-08-11 12:06:15 +08:00
|
|
|
|
2012-02-11 13:08:10 +08:00
|
|
|
// If we see bad encoding for unprefixed string literals, warn and
|
|
|
|
// simply copy the byte values, for compatibility with gcc and older
|
|
|
|
// versions of clang.
|
|
|
|
bool NoErrorOnBadEncoding = isAscii();
|
2012-09-08 15:16:20 +08:00
|
|
|
if (NoErrorOnBadEncoding) {
|
|
|
|
memcpy(ResultPtr, Fragment.data(), Fragment.size());
|
|
|
|
ResultPtr += Fragment.size();
|
|
|
|
}
|
2012-10-29 02:24:46 +08:00
|
|
|
|
2012-09-08 15:16:20 +08:00
|
|
|
if (Diags) {
|
2012-10-29 02:24:46 +08:00
|
|
|
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
|
|
|
|
|
|
|
|
FullSourceLoc SourceLoc(Tok.getLocation(), SM);
|
|
|
|
const DiagnosticBuilder &Builder =
|
|
|
|
Diag(Diags, Features, SourceLoc, TokBegin,
|
2012-11-09 03:22:26 +08:00
|
|
|
ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
|
2012-10-29 02:24:46 +08:00
|
|
|
NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
|
|
|
|
: diag::err_bad_string_encoding);
|
|
|
|
|
2012-11-09 03:22:26 +08:00
|
|
|
const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
|
2012-10-29 02:24:46 +08:00
|
|
|
StringRef NextFragment(NextStart, Fragment.end()-NextStart);
|
|
|
|
|
2012-11-09 03:22:31 +08:00
|
|
|
// Decode into a dummy buffer.
|
|
|
|
SmallString<512> Dummy;
|
|
|
|
Dummy.reserve(Fragment.size() * CharByteWidth);
|
|
|
|
char *Ptr = Dummy.data();
|
|
|
|
|
2014-05-23 03:56:11 +08:00
|
|
|
while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
|
2012-10-29 02:24:46 +08:00
|
|
|
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
|
2012-11-09 03:22:26 +08:00
|
|
|
NextStart = resyncUTF8(ErrorPtr, Fragment.end());
|
2012-10-29 02:24:46 +08:00
|
|
|
Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
|
|
|
|
ErrorPtr, NextStart);
|
|
|
|
NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
|
|
|
|
}
|
2012-09-08 15:16:20 +08:00
|
|
|
}
|
2012-02-11 13:08:10 +08:00
|
|
|
return !NoErrorOnBadEncoding;
|
|
|
|
}
|
2011-08-11 12:06:15 +08:00
|
|
|
|
2012-05-04 01:50:32 +08:00
|
|
|
/// \brief Mark the parse as failed and, when a diagnostics engine is
/// available, report err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (!Diags)
    return;
  Diags->Report(Loc, diag::err_lexing_string);
}
|
|
|
|
|
2009-02-19 03:21:10 +08:00
|
|
|
/// getOffsetOfStringByte - This function returns the offset of the
|
|
|
|
/// specified byte of the string data represented by Token. This handles
|
|
|
|
/// advancing over escape sequences in the string.
|
|
|
|
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
|
2010-11-17 14:46:14 +08:00
|
|
|
unsigned ByteNo) const {
|
2009-02-19 03:21:10 +08:00
|
|
|
// Get the spelling of the token.
|
2012-02-05 10:13:05 +08:00
|
|
|
SmallString<32> SpellingBuffer;
|
2010-08-31 01:47:05 +08:00
|
|
|
SpellingBuffer.resize(Tok.getLength());
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2010-03-16 13:20:39 +08:00
|
|
|
bool StringInvalid = false;
|
2009-02-19 03:21:10 +08:00
|
|
|
const char *SpellingPtr = &SpellingBuffer[0];
|
2010-11-17 15:26:20 +08:00
|
|
|
unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
|
|
|
|
&StringInvalid);
|
2010-11-17 14:26:08 +08:00
|
|
|
if (StringInvalid)
|
2010-03-16 13:20:39 +08:00
|
|
|
return 0;
|
2009-02-19 03:21:10 +08:00
|
|
|
|
2012-06-13 13:37:23 +08:00
|
|
|
const char *SpellingStart = SpellingPtr;
|
|
|
|
const char *SpellingEnd = SpellingPtr+TokLen;
|
|
|
|
|
|
|
|
// Handle UTF-8 strings just like narrow strings.
|
|
|
|
if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
|
|
|
|
SpellingPtr += 2;
|
|
|
|
|
2011-07-27 13:40:30 +08:00
|
|
|
assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
|
|
|
|
SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
|
2009-02-19 03:21:10 +08:00
|
|
|
|
2012-06-13 13:37:23 +08:00
|
|
|
// For raw string literals, this is easy.
|
|
|
|
if (SpellingPtr[0] == 'R') {
|
|
|
|
assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
|
|
|
|
// Skip 'R"'.
|
|
|
|
SpellingPtr += 2;
|
|
|
|
while (*SpellingPtr != '(') {
|
|
|
|
++SpellingPtr;
|
|
|
|
assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
|
|
|
|
}
|
|
|
|
// Skip '('.
|
|
|
|
++SpellingPtr;
|
|
|
|
return SpellingPtr - SpellingStart + ByteNo;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2012-06-13 13:37:23 +08:00
|
|
|
// Skip over the leading quote
|
2009-02-19 03:21:10 +08:00
|
|
|
assert(SpellingPtr[0] == '"' && "Should be a string literal!");
|
|
|
|
++SpellingPtr;
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-02-19 03:21:10 +08:00
|
|
|
// Skip over bytes until we find the offset we're looking for.
|
|
|
|
while (ByteNo) {
|
|
|
|
assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-02-19 03:21:10 +08:00
|
|
|
// Step over non-escapes simply.
|
|
|
|
if (*SpellingPtr != '\\') {
|
|
|
|
++SpellingPtr;
|
|
|
|
--ByteNo;
|
|
|
|
continue;
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-02-19 03:21:10 +08:00
|
|
|
// Otherwise, this is an escape character. Advance over it.
|
|
|
|
bool HadError = false;
|
2012-06-13 13:37:23 +08:00
|
|
|
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
|
|
|
|
const char *EscapePtr = SpellingPtr;
|
|
|
|
unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
|
|
|
|
1, Features, HadError);
|
|
|
|
if (Len > ByteNo) {
|
|
|
|
// ByteNo is somewhere within the escape sequence.
|
|
|
|
SpellingPtr = EscapePtr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ByteNo -= Len;
|
|
|
|
} else {
|
2012-09-08 15:16:20 +08:00
|
|
|
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
|
2012-06-13 13:37:23 +08:00
|
|
|
FullSourceLoc(Tok.getLocation(), SM),
|
2012-09-08 15:16:20 +08:00
|
|
|
CharByteWidth*8, Diags, Features);
|
2012-06-13 13:37:23 +08:00
|
|
|
--ByteNo;
|
|
|
|
}
|
2009-02-19 03:21:10 +08:00
|
|
|
assert(!HadError && "This method isn't valid on erroneous strings");
|
|
|
|
}
|
2009-09-09 23:08:12 +08:00
|
|
|
|
2009-02-19 03:21:10 +08:00
|
|
|
return SpellingPtr-SpellingStart;
|
|
|
|
}
|