[libc][NFC] add index mode to printf parser

This patch is a followup to the previous patch which implemented the
main printf parsing logic as well as sequential mode. This patch adds
index mode.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D123424
This commit is contained in:
Michael Jones 2022-04-07 14:22:48 -07:00
parent 7e63a0d479
commit 945fa672c6
4 changed files with 487 additions and 8 deletions

View File

@ -17,7 +17,7 @@ add_object_library(
libc.src.__support.ctype_utils
libc.src.__support.str_to_integer
libc.src.__support.CPP.bit
libc.src.string.memory_utils.memset_implementation
)
add_header_library(

View File

@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
// #define LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
#include "parser.h"
#include "src/__support/arg_list.h"
@ -17,8 +19,6 @@
namespace __llvm_libc {
namespace printf_core {
#define LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE 1 // This will be a compile flag.
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#else
@ -36,6 +36,10 @@ FormatSection Parser::get_next_section() {
++cur_pos;
[[maybe_unused]] size_t conv_index = 0;
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
conv_index = parse_index(&cur_pos);
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
section.flags = parse_flags(&cur_pos);
// handle width
@ -192,7 +196,7 @@ LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
return LengthModifier::l;
}
case ('h'):
if (str[cur_pos + 1] == 'h') {
if (str[*local_pos + 1] == 'h') {
*local_pos += 2;
return LengthModifier::hh;
} else {
@ -216,5 +220,190 @@ LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
}
}
//----------------------------------------------------
// INDEX MODE ONLY FUNCTIONS AFTER HERE:
//----------------------------------------------------
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
size_t Parser::parse_index(size_t *local_pos) {
if (internal::isdigit(str[*local_pos])) {
char *int_end;
size_t index =
internal::strtointeger<size_t>(str + *local_pos, &int_end, 10);
if (int_end[0] != '$')
return 0;
*local_pos = 1 + int_end - str;
return index;
}
return 0;
}
Parser::TypeDesc Parser::get_type_desc(size_t index) {
// index mode is assumed, and the indicies start at 1, so an index
// of 0 is invalid.
size_t local_pos = 0;
while (str[local_pos]) {
if (str[local_pos] == '%') {
++local_pos;
size_t conv_index = parse_index(&local_pos);
// the flags aren't relevant for this situation, but I need to skip past
// them so they're parsed but the result is discarded.
parse_flags(&local_pos);
// handle width
if (str[local_pos] == '*') {
++local_pos;
size_t width_index = parse_index(&local_pos);
set_type_desc(width_index, TYPE_DESC<int>);
if (width_index == index)
return TYPE_DESC<int>;
} else if (internal::isdigit(str[local_pos])) {
while (internal::isdigit(str[local_pos]))
++local_pos;
}
// handle precision
if (str[local_pos] == '.') {
++local_pos;
if (str[local_pos] == '*') {
++local_pos;
size_t precision_index = parse_index(&local_pos);
set_type_desc(precision_index, TYPE_DESC<int>);
if (precision_index == index)
return TYPE_DESC<int>;
} else if (internal::isdigit(str[local_pos])) {
while (internal::isdigit(str[local_pos]))
++local_pos;
}
}
LengthModifier lm = parse_length_modifier(&local_pos);
// if we don't have an index for this conversion, then its position is
// unknown and all this information is irrelevant. The rest of this logic
// has been for skipping past this conversion properly to avoid
// weirdness with %%.
if (conv_index == 0) {
++local_pos;
continue;
}
TypeDesc conv_size = TYPE_DESC<void>;
switch (str[local_pos]) {
case ('%'):
conv_size = TYPE_DESC<void>;
break;
case ('c'):
conv_size = TYPE_DESC<int>;
break;
case ('d'):
case ('i'):
case ('o'):
case ('x'):
case ('X'):
case ('u'):
switch (lm) {
case (LengthModifier::hh):
case (LengthModifier::h):
case (LengthModifier::none):
conv_size = TYPE_DESC<int>;
break;
case (LengthModifier::l):
conv_size = TYPE_DESC<long>;
break;
case (LengthModifier::ll):
case (LengthModifier::L): // This isn't in the standard, but is in other
// libc implementations.
conv_size = TYPE_DESC<long long>;
break;
case (LengthModifier::j):
conv_size = TYPE_DESC<intmax_t>;
break;
case (LengthModifier::z):
conv_size = TYPE_DESC<size_t>;
break;
case (LengthModifier::t):
conv_size = TYPE_DESC<ptrdiff_t>;
break;
}
break;
case ('f'):
case ('F'):
case ('e'):
case ('E'):
case ('a'):
case ('A'):
case ('g'):
case ('G'):
if (lm != LengthModifier::L)
conv_size = TYPE_DESC<double>;
else
conv_size = TYPE_DESC<long double>;
break;
case ('n'):
case ('p'):
case ('s'):
conv_size = TYPE_DESC<void *>;
break;
default:
conv_size = TYPE_DESC<int>;
break;
}
set_type_desc(conv_index, conv_size);
if (conv_index == index)
return conv_size;
}
++local_pos;
}
// If there is no size for the requested index, then just guess that it's an
// int.
return TYPE_DESC<int>;
}
void Parser::args_to_index(size_t index) {
if (args_index > index) {
args_index = 1;
args_cur = args_start;
}
while (args_index < index) {
Parser::TypeDesc cur_type_desc = TYPE_DESC<void>;
if (args_index <= DESC_ARR_LEN)
cur_type_desc = desc_arr[args_index - 1];
if (cur_type_desc == TYPE_DESC<void>)
cur_type_desc = get_type_desc(args_index);
if (cur_type_desc == TYPE_DESC<uint32_t>)
args_cur.next_var<uint32_t>();
else if (cur_type_desc == TYPE_DESC<uint64_t>)
args_cur.next_var<uint64_t>();
// Floating point numbers are stored separately from the other arguments.
else if (cur_type_desc == TYPE_DESC<double>)
args_cur.next_var<double>();
else if (cur_type_desc == TYPE_DESC<long double>)
args_cur.next_var<long double>();
// pointers may be stored separately from normal values.
else if (cur_type_desc == TYPE_DESC<void *>)
args_cur.next_var<void *>();
else
args_cur.next_var<uint32_t>();
++args_index;
}
}
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
} // namespace printf_core
} // namespace __llvm_libc

View File

@ -11,6 +11,7 @@
#include "src/__support/arg_list.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/string/memory_utils/memset_implementations.h"
#include <stddef.h>
@ -21,16 +22,50 @@ class Parser {
const char *__restrict str;
size_t cur_pos = 0;
internal::ArgList args_start;
internal::ArgList args_cur;
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
// args_start stores the start of the va_args, which is allows getting the
// value of arguments that have already been passed. args_index is tracked so
// that we know which argument args_cur is on.
internal::ArgList args_start;
size_t args_index = 1;
enum PrimaryType : uint8_t { Integer = 0, Float = 1, Pointer = 2 };
// TypeDesc stores the information about a type that is relevant to printf in
// a relatively compact manner.
struct TypeDesc {
uint8_t size;
PrimaryType primary_type;
constexpr bool operator==(const TypeDesc &other) const {
return (size == other.size) && (primary_type == other.primary_type);
}
};
// TODO: Make this size configurable via a compile option.
static constexpr size_t DESC_ARR_LEN = 32;
// desc_arr stores the sizes of the variables in the ArgList. This is used in
// index mode to reduce repeated string parsing. The sizes are stored as
// TypeDesc objects, which store the size as well as minimal type information.
// This is necessary because some systems separate the floating point and
// integer values in va_args.
TypeDesc desc_arr[DESC_ARR_LEN];
// TODO: Look into object stores for optimization.
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
public:
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
Parser(const char *__restrict new_str, internal::ArgList &args)
: str(new_str), args_start(args), args_cur(args) {}
: str(new_str), args_cur(args), args_start(args) {
inline_memset(reinterpret_cast<char *>(desc_arr), 0,
DESC_ARR_LEN * sizeof(TypeDesc));
}
#else
Parser(const char *__restrict new_str, internal::ArgList &args)
: str(new_str), args_cur(args) {}
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
// get_next_section will parse the format string until it has a fully
// specified format section. This can either be a raw format section with no
@ -53,9 +88,68 @@ private:
// get_next_arg_value gets the next value from the arg list as type T.
template <class T> T inline get_next_arg_value() {
++args_index;
return args_cur.next_var<T>();
}
//----------------------------------------------------
// INDEX MODE ONLY FUNCTIONS AFTER HERE:
//----------------------------------------------------
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
// parse_index parses the index of a value inside a format string. It
// assumes that str[*local_pos] points to character after a '%' or '*', and
// returns 0 if there is no closing $, or if it finds no number. If it finds a
// number, it will move local_pos past the end of the $, else it will not move
// local_pos.
size_t parse_index(size_t *local_pos);
template <typename T>
static constexpr TypeDesc TYPE_DESC{sizeof(T), PrimaryType::Integer};
template <>
static constexpr TypeDesc TYPE_DESC<double>{sizeof(double),
PrimaryType::Float};
template <>
static constexpr TypeDesc TYPE_DESC<long double>{sizeof(long double),
PrimaryType::Float};
template <>
static constexpr TypeDesc TYPE_DESC<void *>{sizeof(void *),
PrimaryType::Pointer};
template <>
static constexpr TypeDesc TYPE_DESC<void>{0, PrimaryType::Integer};
void inline set_type_desc(size_t index, TypeDesc value) {
if (index != 0 && index <= DESC_ARR_LEN)
desc_arr[index - 1] = value;
}
// get_arg_value gets the value from the arg list at index (starting at 1).
// This may require parsing the format string. An index of 0 is interpreted as
// the next value.
template <class T> T inline get_arg_value(size_t index) {
if (!(index == 0 || index == args_index))
args_to_index(index);
set_type_desc(index, TYPE_DESC<T>);
++args_index;
return get_next_arg_value<T>();
}
// the ArgList can only return the next item in the list. This function is
// used in index mode when the item that needs to be read is not the next one.
// It moves cur_args to the index requested so the the appropriate value may
// be read. This may involve parsing the format string, and is in the worst
// case an O(n^2) operation.
void args_to_index(size_t index);
// get_type_desc assumes that this format string uses index mode. It iterates
// through the format string until it finds a format specifier that defines
// the type of index, and returns a TypeDesc describing that type. It does not
// modify cur_pos.
TypeDesc get_type_desc(size_t index);
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
};
} // namespace printf_core

View File

@ -260,3 +260,199 @@ TEST(LlvmLibcPrintfParserTest, EvalThreeArgs) {
ASSERT_FORMAT_EQ(expected2, format_arr[2]);
}
#ifndef LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE
TEST(LlvmLibcPrintfParserTest, IndexModeOneArg) {
__llvm_libc::printf_core::FormatSection format_arr[10];
const char *str = "%1$d";
int arg1 = 12345;
evaluate(format_arr, str, arg1);
__llvm_libc::printf_core::FormatSection expected;
expected.has_conv = true;
expected.raw_len = 4;
expected.raw_string = str;
expected.conv_val_raw = arg1;
expected.conv_name = 'd';
ASSERT_FORMAT_EQ(expected, format_arr[0]);
}
TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsSequential) {
__llvm_libc::printf_core::FormatSection format_arr[10];
const char *str = "%1$d%2$f%3$s";
int arg1 = 12345;
double arg2 = 123.45;
const char *arg3 = "12345";
evaluate(format_arr, str, arg1, arg2, arg3);
__llvm_libc::printf_core::FormatSection expected0, expected1, expected2;
expected0.has_conv = true;
expected0.raw_len = 4;
expected0.raw_string = str;
expected0.conv_val_raw = arg1;
expected0.conv_name = 'd';
ASSERT_FORMAT_EQ(expected0, format_arr[0]);
expected1.has_conv = true;
expected1.raw_len = 4;
expected1.raw_string = str + 4;
expected1.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
expected1.conv_name = 'f';
ASSERT_FORMAT_EQ(expected1, format_arr[1]);
expected2.has_conv = true;
expected2.raw_len = 4;
expected2.raw_string = str + 8;
expected2.conv_val_ptr = const_cast<char *>(arg3);
expected2.conv_name = 's';
ASSERT_FORMAT_EQ(expected2, format_arr[2]);
}
TEST(LlvmLibcPrintfParserTest, IndexModeThreeArgsReverse) {
__llvm_libc::printf_core::FormatSection format_arr[10];
const char *str = "%3$d%2$f%1$s";
int arg1 = 12345;
double arg2 = 123.45;
const char *arg3 = "12345";
evaluate(format_arr, str, arg3, arg2, arg1);
__llvm_libc::printf_core::FormatSection expected0, expected1, expected2;
expected0.has_conv = true;
expected0.raw_len = 4;
expected0.raw_string = str;
expected0.conv_val_raw = arg1;
expected0.conv_name = 'd';
ASSERT_FORMAT_EQ(expected0, format_arr[0]);
expected1.has_conv = true;
expected1.raw_len = 4;
expected1.raw_string = str + 4;
expected1.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
expected1.conv_name = 'f';
ASSERT_FORMAT_EQ(expected1, format_arr[1]);
expected2.has_conv = true;
expected2.raw_len = 4;
expected2.raw_string = str + 8;
expected2.conv_val_ptr = const_cast<char *>(arg3);
expected2.conv_name = 's';
ASSERT_FORMAT_EQ(expected2, format_arr[2]);
}
TEST(LlvmLibcPrintfParserTest, IndexModeTenArgsRandom) {
__llvm_libc::printf_core::FormatSection format_arr[10];
const char *str = "%6$d%3$d%7$d%2$d%8$d%1$d%4$d%9$d%5$d%10$d";
int args[10] = {6, 4, 2, 7, 9, 1, 3, 5, 8, 10};
evaluate(format_arr, str, args[0], args[1], args[2], args[3], args[4],
args[5], args[6], args[7], args[8], args[9]);
for (size_t i = 0; i < 10; ++i) {
__llvm_libc::printf_core::FormatSection expected;
expected.has_conv = true;
expected.raw_len = 4 + (i >= 9 ? 1 : 0);
expected.raw_string = str + (4 * i);
expected.conv_val_raw = i + 1;
expected.conv_name = 'd';
EXPECT_FORMAT_EQ(expected, format_arr[i]);
}
}
TEST(LlvmLibcPrintfParserTest, IndexModeComplexParsing) {
__llvm_libc::printf_core::FormatSection format_arr[10];
const char *str = "normal text %3$llu %% %2$ *4$f %2$ .*4$f %1$1.1c";
char arg1 = '1';
double arg2 = 123.45;
unsigned long long arg3 = 12345;
int arg4 = 10;
evaluate(format_arr, str, arg1, arg2, arg3, arg4);
__llvm_libc::printf_core::FormatSection expected0, expected1, expected2,
expected3, expected4, expected5, expected6, expected7, expected8,
expected9;
expected0.has_conv = false;
expected0.raw_len = 12;
expected0.raw_string = str;
EXPECT_FORMAT_EQ(expected0, format_arr[0]);
expected1.has_conv = true;
expected1.raw_len = 6;
expected1.raw_string = str + 12;
expected1.length_modifier = __llvm_libc::printf_core::LengthModifier::ll;
expected1.conv_val_raw = arg3;
expected1.conv_name = 'u';
EXPECT_FORMAT_EQ(expected1, format_arr[1]);
expected2.has_conv = false;
expected2.raw_len = 1;
expected2.raw_string = str + 18;
EXPECT_FORMAT_EQ(expected2, format_arr[2]);
expected3.has_conv = true;
expected3.raw_len = 2;
expected3.raw_string = str + 19;
expected3.conv_name = '%';
EXPECT_FORMAT_EQ(expected3, format_arr[3]);
expected4.has_conv = false;
expected4.raw_len = 1;
expected4.raw_string = str + 21;
EXPECT_FORMAT_EQ(expected4, format_arr[4]);
expected5.has_conv = true;
expected5.raw_len = 8;
expected5.raw_string = str + 22;
expected5.flags = __llvm_libc::printf_core::FormatFlags::SPACE_PREFIX;
expected5.min_width = arg4;
expected5.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
expected5.conv_name = 'f';
EXPECT_FORMAT_EQ(expected5, format_arr[5]);
expected6.has_conv = false;
expected6.raw_len = 1;
expected6.raw_string = str + 30;
EXPECT_FORMAT_EQ(expected6, format_arr[6]);
expected7.has_conv = true;
expected7.raw_len = 9;
expected7.raw_string = str + 31;
expected7.flags = __llvm_libc::printf_core::FormatFlags::SPACE_PREFIX;
expected7.precision = arg4;
expected7.conv_val_raw = __llvm_libc::bit_cast<uint64_t>(arg2);
expected7.conv_name = 'f';
EXPECT_FORMAT_EQ(expected7, format_arr[7]);
expected8.has_conv = false;
expected8.raw_len = 1;
expected8.raw_string = str + 40;
EXPECT_FORMAT_EQ(expected8, format_arr[8]);
expected9.has_conv = true;
expected9.raw_len = 7;
expected9.raw_string = str + 41;
expected9.min_width = 1;
expected9.precision = 1;
expected9.conv_val_raw = arg1;
expected9.conv_name = 'c';
EXPECT_FORMAT_EQ(expected9, format_arr[9]);
}
#endif // LLVM_LIBC_PRINTF_DISABLE_INDEX_MODE