[Sema] Handle UTF-8 invalid format string specifiers

Improve invalid format string specifier handling by printing out
invalid specifier characters with \x, \u and \U. Previously clang
would print garbage whenever the character was unprintable.

Example, before:
  NSLog(@"%\u25B9"); => warning: invalid conversion specifier ' [-Wformat-invalid-specifier]
after:
  NSLog(@"%\u25B9"); => warning: invalid conversion specifier '\u25b9' [-Wformat-invalid-specifier]

Differential Revision: http://reviews.llvm.org/D18296

rdar://problem/24672159

llvm-svn: 264752
This commit is contained in:
Bruno Cardoso Lopes 2016-03-29 17:35:02 +00:00
parent ac400900da
commit 0c18d03d91
9 changed files with 106 additions and 18 deletions

View File

@ -210,6 +210,7 @@ public:
/// Length of this conversion specifier. When no end position has been
/// recorded in EndScanList the specifier is one character long; otherwise
/// the length is the distance from Position to EndScanList.
unsigned getLength() const {
  if (!EndScanList)
    return 1;
  return EndScanList - Position;
}
/// Records \p pos as the end position (EndScanList) of this specifier.
void setEndScanList(const char *pos) {
  EndScanList = pos;
}
/// True when the specifier kind lies in [IntArgBeg, IntArgEnd] or is one of
/// the FreeBSD 'r' / 'y' kinds.
bool isIntArg() const {
  if (kind == FreeBSDrArg || kind == FreeBSDyArg)
    return true;
  return kind >= IntArgBeg && kind <= IntArgEnd;
}
@ -413,11 +414,6 @@ public:
/// True when the specifier kind lies in the inclusive range [ObjCBeg, ObjCEnd].
bool isObjCArg() const {
  return ObjCBeg <= kind && kind <= ObjCEnd;
}
/// True when the specifier kind lies in the inclusive range
/// [DoubleArgBeg, DoubleArgEnd].
bool isDoubleArg() const {
  if (kind < DoubleArgBeg)
    return false;
  return kind <= DoubleArgEnd;
}
/// Length of the conversion specifier. Conversion specifiers are currently
/// represented by a single character; the accessor leaves room for that to
/// change.
unsigned getLength() const { return 1; }
static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
return CS->isPrintfKind();
@ -546,8 +542,6 @@ public:
/// Builds a scanf conversion specifier of kind \p k located at \p pos.
/// The leading `false` handed to the base class is presumably its
/// "is printf" flag -- confirm against the ConversionSpecifier constructor.
ScanfConversionSpecifier(const char *pos, Kind k)
: ConversionSpecifier(false, pos, k) {}
/// Stores \p pos in EndScanList.
void setEndScanList(const char *pos) {
  EndScanList = pos;
}
/// LLVM-style RTTI hook: a ConversionSpecifier belongs to this class exactly
/// when it does not report itself as a printf kind.
static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
  const bool IsPrintf = CS->isPrintfKind();
  return !IsPrintf;
}

View File

@ -15,6 +15,7 @@
#include "FormatStringParsing.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TargetInfo.h"
#include "llvm/Support/ConvertUTF.h"
using clang::analyze_format_string::ArgType;
using clang::analyze_format_string::FormatStringHandler;
@ -260,6 +261,28 @@ clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
return true;
}
/// Checks whether the byte that follows \p SpecifierBegin starts a multibyte
/// UTF-8 sequence lying entirely before \p FmtStrEnd.
///
/// \param SpecifierBegin Start of the invalid specifier; the byte at
///        SpecifierBegin + 1 is treated as the lead byte of the sequence.
/// \param FmtStrEnd One past the last byte of the format string.
/// \param Len Set to the sequence length plus one (accounting for the byte at
///        \p SpecifierBegin) on success; left untouched otherwise.
/// \returns true only when the lead byte announces more than one byte and the
///          whole announced sequence fits before \p FmtStrEnd.
bool clang::analyze_format_string::ParseUTF8InvalidSpecifier(
    const char *SpecifierBegin, const char *FmtStrEnd, unsigned &Len) {
  // There must be at least one byte after SpecifierBegin. Comparing the
  // distance avoids forming SpecifierBegin + 1 when it would lie beyond
  // one-past-the-end of the buffer.
  if (FmtStrEnd - SpecifierBegin <= 1)
    return false;

  const UTF8 *SB = reinterpret_cast<const UTF8 *>(SpecifierBegin + 1);
  const UTF8 *SE = reinterpret_cast<const UTF8 *>(FmtStrEnd);

  // If the invalid specifier is a multibyte UTF-8 string, return the
  // total length accordingly so that the conversion specifier can be
  // properly updated to reflect a complete UTF-8 specifier.
  const unsigned NumBytes = getNumBytesForUTF8(*SB);
  if (NumBytes == 1)
    return false;

  // Compare lengths rather than computing SB + NumBytes: that pointer could
  // land more than one element past the end of the buffer, which is undefined
  // behaviour. SE - SB is positive here thanks to the check above.
  if (NumBytes > static_cast<size_t>(SE - SB))
    return false;

  Len = NumBytes + 1;
  return true;
}
//===----------------------------------------------------------------------===//
// Methods on ArgType.
//===----------------------------------------------------------------------===//

View File

@ -46,7 +46,13 @@ bool ParseArgPosition(FormatStringHandler &H,
/// FormatSpecifier& argument, and false otherwise.
bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E,
const LangOptions &LO, bool IsScanf = false);
/// Returns true if the byte following \p SpecifierBegin starts a multibyte
/// UTF-8 sequence that ends at or before \p FmtStrEnd. On success \p Len is
/// set to the length of that sequence plus one (for the byte at
/// \p SpecifierBegin); on failure \p Len is left unchanged.
bool ParseUTF8InvalidSpecifier(const char *SpecifierBegin,
const char *FmtStrEnd, unsigned &Len);
template <typename T> class SpecifierResult {
T FS;
const char *Start;

View File

@ -312,8 +312,13 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H,
argIndex++;
if (k == ConversionSpecifier::InvalidSpecifier) {
unsigned Len = I - Start;
if (ParseUTF8InvalidSpecifier(Start, E, Len)) {
CS.setEndScanList(Start + Len);
FS.setConversionSpecifier(CS);
}
// Assume the conversion takes one argument.
return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start);
return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len);
}
return PrintfSpecifierResult(Start, FS);
}

View File

@ -79,7 +79,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
unsigned &argIndex,
const LangOptions &LO,
const TargetInfo &Target) {
using namespace clang::analyze_format_string;
using namespace clang::analyze_scanf;
const char *I = Beg;
const char *Start = nullptr;
@ -210,10 +210,15 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
// FIXME: '%' and '*' doesn't make sense. Issue a warning.
// FIXME: 'ConsumedSoFar' and '*' doesn't make sense.
if (k == ScanfConversionSpecifier::InvalidSpecifier) {
unsigned Len = I - Beg;
if (ParseUTF8InvalidSpecifier(Beg, E, Len)) {
CS.setEndScanList(Beg + Len);
FS.setConversionSpecifier(CS);
}
// Assume the conversion takes one argument.
return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg);
return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len);
}
return ScanfSpecifierResult(Start, FS);
}

View File

@ -36,6 +36,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Locale.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
#include <limits>
@ -3976,12 +3978,41 @@ CheckFormatHandler::HandleInvalidConversionSpecifier(unsigned argIndex,
// gibberish when trying to match arguments.
keepGoing = false;
}
EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion)
<< StringRef(csStart, csLen),
Loc, /*IsStringLocation*/true,
getSpecifierRange(startSpec, specifierLen));
StringRef Specifier(csStart, csLen);
// If the specifier in non-printable, it could be the first byte of a UTF-8
// sequence. In that case, print the UTF-8 code point. If not, print the byte
// hex value.
std::string CodePointStr;
if (!llvm::sys::locale::isPrint(*csStart)) {
UTF32 CodePoint;
const UTF8 **B = reinterpret_cast<const UTF8 **>(&csStart);
const UTF8 *E =
reinterpret_cast<const UTF8 *>(csStart + csLen);
ConversionResult Result =
llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion);
if (Result != conversionOK) {
unsigned char FirstChar = *csStart;
CodePoint = (UTF32)FirstChar;
}
llvm::raw_string_ostream OS(CodePointStr);
if (CodePoint < 256)
OS << "\\x" << llvm::format("%02x", CodePoint);
else if (CodePoint <= 0xFFFF)
OS << "\\u" << llvm::format("%04x", CodePoint);
else
OS << "\\U" << llvm::format("%08x", CodePoint);
OS.flush();
Specifier = CodePointStr;
}
EmitFormatDiagnostic(
S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc,
/*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen));
return keepGoing;
}

View File

@ -183,3 +183,11 @@ void check_conditional_literal(char *s, int *i) {
scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}}
scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}}
}
// Invalid scanf conversion specifiers made of non-printable bytes. Each
// multibyte UTF-8 specifier is written once as a universal character name
// and once as raw bytes; both spellings should be diagnosed with the same
// escaped code point. The final case is a lone 0xE2 byte, which should be
// diagnosed as a hex byte.
void testInvalidNoPrintable(int *a) {
scanf("%\u25B9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
scanf("%\xE2\x96\xB9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
scanf("%\U00010348", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
scanf("%\xF0\x90\x8D\x88", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
scanf("%\xe2", a); // expected-warning {{invalid conversion specifier '\xe2'}}
}

View File

@ -642,6 +642,14 @@ void test_qualifiers(volatile int *vip, const int *cip,
printf("%n", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}}
}
// Invalid printf conversion specifiers made of non-printable bytes. Each
// multibyte UTF-8 specifier is written once as a universal character name
// and once as raw bytes; both spellings should be diagnosed with the same
// escaped code point. The final case is a lone 0xE2 byte, which should be
// diagnosed as a hex byte.
void testInvalidNoPrintable() {
printf("%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
printf("%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
printf("%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
printf("%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
printf("%\xe2"); // expected-warning {{invalid conversion specifier '\xe2'}}
}
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
#pragma GCC diagnostic warning "-Wformat-security"
// <rdar://problem/14178260>

View File

@ -265,3 +265,11 @@ void testObjCModifierFlags() {
NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}}
}
// Test invalid, non-printable conversion specifiers in Objective-C format
// strings. Each multibyte UTF-8 specifier is written once as a universal
// character name and once as raw bytes; both spellings should be diagnosed
// with the same escaped code point. The final case is a lone 0xE2 byte: it
// should be diagnosed as a hex byte, and the literal itself draws an
// additional "input conversion stopped" diagnostic.
void testObjcInvalidNoPrintable(int *a) {
NSLog(@"%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
NSLog(@"%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
NSLog(@"%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
NSLog(@"%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
NSLog(@"%\xe2"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}}
}