forked from OSchip/llvm-project
[ms] [llvm-ml] Lex MASM strings, including escaping
Allow single-quoted strings and double-quoted character values, as well as doubled-quote escaping.

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D89731
This commit is contained in:
parent
c126eb7529
commit
07c4f1d10b
|
@ -56,6 +56,7 @@ private:
|
|||
bool isAtStartOfComment(const char *Ptr);
|
||||
bool isAtStatementSeparator(const char *Ptr);
|
||||
int getNextChar();
|
||||
int peekNextChar();
|
||||
AsmToken ReturnError(const char *Loc, const std::string &Msg);
|
||||
|
||||
AsmToken LexIdentifier();
|
||||
|
|
|
@ -51,6 +51,7 @@ protected: // Can only create subclasses.
|
|||
bool IsAtStartOfStatement = true;
|
||||
bool LexMasmHexFloats = false;
|
||||
bool LexMasmIntegers = false;
|
||||
bool LexMasmStrings = false;
|
||||
bool UseMasmDefaultRadix = false;
|
||||
unsigned DefaultRadix = 10;
|
||||
AsmCommentConsumer *CommentConsumer = nullptr;
|
||||
|
@ -163,6 +164,10 @@ public:
|
|||
|
||||
/// Set whether to lex masm-style hex float literals, such as 3f800000r.
/// Stores \p V in the LexMasmHexFloats flag, which is initialized to false.
|
||||
void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; }
|
||||
|
||||
/// Set whether to lex masm-style string literals, such as 'Can''t find file'
|
||||
/// and "This ""value"" not found".
/// Stores \p V in the LexMasmStrings flag, which is initialized to false.
|
||||
void setLexMasmStrings(bool V) { LexMasmStrings = V; }
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
|
|
@ -64,6 +64,12 @@ int AsmLexer::getNextChar() {
|
|||
return (unsigned char)*CurPtr++;
|
||||
}
|
||||
|
||||
/// Return the character at CurPtr, converted through unsigned char, without
/// advancing CurPtr. Returns EOF when CurPtr has reached CurBuf.end().
int AsmLexer::peekNextChar() {
|
||||
if (CurPtr == CurBuf.end())
|
||||
return EOF;
|
||||
return (unsigned char)*CurPtr;
|
||||
}
|
||||
|
||||
/// The leading integral digit sequence and dot should have already been
|
||||
/// consumed, some or all of the fractional digit sequence *can* have been
|
||||
/// consumed.
|
||||
|
@ -521,6 +527,24 @@ AsmToken AsmLexer::LexDigit() {
|
|||
AsmToken AsmLexer::LexSingleQuote() {
|
||||
int CurChar = getNextChar();
|
||||
|
||||
if (LexMasmStrings) {
|
||||
while (CurChar != EOF) {
|
||||
if (CurChar != '\'') {
|
||||
CurChar = getNextChar();
|
||||
} else if (peekNextChar() == '\'') {
|
||||
// In MASM single-quote strings, doubled single-quotes mean an escaped
|
||||
// single quote, so should be lexed in.
|
||||
getNextChar();
|
||||
CurChar = getNextChar();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (CurChar == EOF)
|
||||
return ReturnError(TokStart, "unterminated string constant");
|
||||
return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
|
||||
}
|
||||
|
||||
if (CurChar == '\\')
|
||||
CurChar = getNextChar();
|
||||
|
||||
|
@ -555,6 +579,24 @@ AsmToken AsmLexer::LexSingleQuote() {
|
|||
/// LexQuote: String: "..."
|
||||
AsmToken AsmLexer::LexQuote() {
|
||||
int CurChar = getNextChar();
|
||||
if (LexMasmStrings) {
|
||||
while (CurChar != EOF) {
|
||||
if (CurChar != '"') {
|
||||
CurChar = getNextChar();
|
||||
} else if (peekNextChar() == '"') {
|
||||
// In MASM double-quoted strings, doubled double-quotes mean an escaped
|
||||
// double quote, so should be lexed in.
|
||||
getNextChar();
|
||||
CurChar = getNextChar();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (CurChar == EOF)
|
||||
return ReturnError(TokStart, "unterminated string constant");
|
||||
return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
|
||||
}
|
||||
|
||||
// TODO: does gas allow multiline string constants?
|
||||
while (CurChar != '"') {
|
||||
if (CurChar == '\\') {
|
||||
|
|
|
@ -3086,70 +3086,19 @@ bool MasmParser::parseEscapedString(std::string &Data) {
|
|||
return true;
|
||||
|
||||
Data = "";
|
||||
char Quote = getTok().getString().front();
|
||||
StringRef Str = getTok().getStringContents();
|
||||
for (unsigned i = 0, e = Str.size(); i != e; ++i) {
|
||||
if (Str[i] != '\\') {
|
||||
Data += Str[i];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Recognize escaped characters. Note that this escape semantics currently
|
||||
// loosely follows Darwin 'as'.
|
||||
++i;
|
||||
if (i == e)
|
||||
return TokError("unexpected backslash at end of string");
|
||||
|
||||
// Recognize hex sequences similarly to GNU 'as'.
|
||||
if (Str[i] == 'x' || Str[i] == 'X') {
|
||||
size_t length = Str.size();
|
||||
if (i + 1 >= length || !isHexDigit(Str[i + 1]))
|
||||
return TokError("invalid hexadecimal escape sequence");
|
||||
|
||||
// Consume hex characters. GNU 'as' reads all hexadecimal characters and
|
||||
// then truncates to the lower 16 bits. Seems reasonable.
|
||||
unsigned Value = 0;
|
||||
while (i + 1 < length && isHexDigit(Str[i + 1]))
|
||||
Value = Value * 16 + hexDigitValue(Str[++i]);
|
||||
|
||||
Data += (unsigned char)(Value & 0xFF);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Recognize octal sequences.
|
||||
if ((unsigned)(Str[i] - '0') <= 7) {
|
||||
// Consume up to three octal characters.
|
||||
unsigned Value = Str[i] - '0';
|
||||
|
||||
if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
|
||||
Data.reserve(Str.size());
|
||||
for (int i = 0, e = Str.size(); i != e; ++i) {
|
||||
Data.push_back(Str[i]);
|
||||
if (Str[i] == Quote) {
|
||||
// MASM treats doubled delimiting quotes as an escaped delimiting quote.
|
||||
// If we're escaping the string's trailing delimiter, we're definitely
|
||||
// missing a quotation mark.
|
||||
if (i + 1 == Str.size())
|
||||
return Error(getTok().getLoc(), "missing quotation mark in string");
|
||||
if (Str[i + 1] == Quote)
|
||||
++i;
|
||||
Value = Value * 8 + (Str[i] - '0');
|
||||
|
||||
if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
|
||||
++i;
|
||||
Value = Value * 8 + (Str[i] - '0');
|
||||
}
|
||||
}
|
||||
|
||||
if (Value > 255)
|
||||
return TokError("invalid octal escape sequence (out of range)");
|
||||
|
||||
Data += (unsigned char)Value;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise recognize individual escapes.
|
||||
switch (Str[i]) {
|
||||
default:
|
||||
// Just reject invalid escape sequences for now.
|
||||
return TokError("invalid escape sequence (unrecognized character)");
|
||||
|
||||
case 'b': Data += '\b'; break;
|
||||
case 'f': Data += '\f'; break;
|
||||
case 'n': Data += '\n'; break;
|
||||
case 'r': Data += '\r'; break;
|
||||
case 't': Data += '\t'; break;
|
||||
case '"': Data += '"'; break;
|
||||
case '\\': Data += '\\'; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3220,7 +3169,9 @@ bool MasmParser::parseScalarInitializer(unsigned Size,
|
|||
SmallVectorImpl<const MCExpr *> &Values,
|
||||
unsigned StringPadLength) {
|
||||
if (getTok().is(AsmToken::String)) {
|
||||
StringRef Value = getTok().getStringContents();
|
||||
std::string Value;
|
||||
if (parseEscapedString(Value))
|
||||
return true;
|
||||
if (Size == 1) {
|
||||
// Treat each character as an initializer.
|
||||
for (const char CharVal : Value)
|
||||
|
@ -3235,11 +3186,10 @@ bool MasmParser::parseScalarInitializer(unsigned Size,
|
|||
return Error(getTok().getLoc(), "out of range literal value");
|
||||
|
||||
uint64_t IntValue = 0;
|
||||
for (const unsigned char CharVal : Value.bytes())
|
||||
for (const unsigned char CharVal : Value)
|
||||
IntValue = (IntValue << 8) | CharVal;
|
||||
Values.push_back(MCConstantExpr::create(IntValue, getContext()));
|
||||
}
|
||||
Lex();
|
||||
} else {
|
||||
const MCExpr *Value;
|
||||
if (parseExpression(Value))
|
||||
|
|
|
@ -1696,6 +1696,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
|
|||
case AsmToken::At:
|
||||
case AsmToken::String:
|
||||
case AsmToken::Identifier: {
|
||||
if (Parser.isParsingMasm() && Tok.is(AsmToken::String)) {
|
||||
// Single-character strings should be treated as integer constants. This
|
||||
// includes MASM escapes for quotes.
|
||||
char Quote = Tok.getString().front();
|
||||
StringRef Contents = Tok.getStringContents();
|
||||
if (Contents.size() == 1 || Contents == std::string(2, Quote)) {
|
||||
if (SM.onInteger(Contents.front(), ErrMsg))
|
||||
return Error(Tok.getLoc(), ErrMsg);
|
||||
break;
|
||||
}
|
||||
}
|
||||
SMLoc IdentLoc = Tok.getLoc();
|
||||
StringRef Identifier = Tok.getString();
|
||||
UpdateLocLex = false;
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
# RUN: llvm-ml -filetype=asm %s | FileCheck %s
|
||||
|
||||
.data
|
||||
|
||||
dq_single_character BYTE "a"
|
||||
; CHECK-LABEL: dq_single_character:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
dq_join BYTE "ab", "cd"
|
||||
; CHECK-LABEL: dq_join:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
dq_quote_escape BYTE "ab""""cd"
|
||||
; Intended result: ab""cd
|
||||
; CHECK-LABEL: dq_quote_escape:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
dq_single_quote BYTE "ab''''cd"
|
||||
; Intended result: ab''''cd
|
||||
; CHECK-LABEL: dq_single_quote:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
sq_single_character BYTE 'a'
|
||||
; CHECK-LABEL: sq_single_character:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
sq_join BYTE 'ab', 'cd'
|
||||
; CHECK-LABEL: sq_join:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
sq_quote_escape BYTE 'ab''''cd'
|
||||
; Intended result: ab''cd
|
||||
; CHECK-LABEL: sq_quote_escape:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
sq_double_quote BYTE 'ab""""cd'
|
||||
; Intended result: ab""""cd
|
||||
; CHECK-LABEL: sq_double_quote:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
mixed_quotes_join BYTE "a'b", 'c"d'
|
||||
; Intended result: a'bc"d
|
||||
; CHECK-LABEL: mixed_quotes_join:
|
||||
; CHECK-NEXT: .byte 97
|
||||
; CHECK-NEXT: .byte 39
|
||||
; CHECK-NEXT: .byte 98
|
||||
; CHECK-NEXT: .byte 99
|
||||
; CHECK-NEXT: .byte 34
|
||||
; CHECK-NEXT: .byte 100
|
||||
; CHECK-NOT: .byte
|
||||
|
||||
.code
|
||||
|
||||
; Single-quoted one-character strings used as immediate operands are expected
; to be emitted as integer constants: a plain letter, the doubled-quote escape
; for a single quote, and a lone double-quote character.
sq_char_test PROC
|
||||
; CHECK-LABEL: sq_char_test:
|
||||

||||
mov eax, 'a'
|
||||
; CHECK: mov eax, 97
|
||||

||||
mov eax, ''''
|
||||
; CHECK: mov eax, 39
|
||||

||||
mov eax, '"'
|
||||
; CHECK: mov eax, 34
|
||||

||||
ret
|
||||
sq_char_test ENDP
|
||||
|
||||
; Double-quoted one-character strings used as immediate operands are expected
; to be emitted as integer constants: a plain letter, the doubled-quote escape
; for a double quote, and a lone single-quote character.
dq_char_test PROC
|
||||
; CHECK-LABEL: dq_char_test:
|
||||

||||
mov eax, "b"
|
||||
; CHECK: mov eax, 98
|
||||

||||
mov eax, """"
|
||||
; CHECK: mov eax, 34
|
||||

||||
mov eax, "'"
|
||||
; CHECK: mov eax, 39
|
||||

||||
ret
|
||||
dq_char_test ENDP
|
||||
|
||||
end
|
|
@ -46,7 +46,7 @@ t1 foobar <>
|
|||
; CHECK-NEXT: .byte 101
|
||||
; CHECK-NEXT: .zero 1
|
||||
|
||||
t2 FOOBAR <"gh",,<10,11>,<12>,"ijk">
|
||||
t2 FOOBAR <"gh",,<10,11>,<12>,'ijk'>
|
||||
|
||||
; CHECK: t2:
|
||||
;
|
||||
|
|
|
@ -184,6 +184,7 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, raw_ostream &OS) {
|
|||
Lexer.setLexMasmIntegers(true);
|
||||
Lexer.useMasmDefaultRadix(true);
|
||||
Lexer.setLexMasmHexFloats(true);
|
||||
Lexer.setLexMasmStrings(true);
|
||||
|
||||
bool Error = false;
|
||||
while (Lexer.Lex().isNot(AsmToken::Eof)) {
|
||||
|
@ -216,6 +217,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
|
|||
Parser->getLexer().setLexMasmIntegers(true);
|
||||
Parser->getLexer().useMasmDefaultRadix(true);
|
||||
Parser->getLexer().setLexMasmHexFloats(true);
|
||||
Parser->getLexer().setLexMasmStrings(true);
|
||||
|
||||
int Res = Parser->Run(/*NoInitialTextSection=*/true);
|
||||
|
||||
|
|
Loading…
Reference in New Issue