Add the r_json json parser based on nxjson ##util (#17439)

2020-08-14 20:57:09 +02:00 · 2020-08-14 20:57:09 +02:00 · 3ef9c81da9
parent 45994dcc1f
commit 3ef9c81da9
8 changed files with 1707 additions and 5 deletions
--- a/libr/include/r_util/r_json.h
+++ b/libr/include/r_util/r_json.h
@ -0,0 +1,67 @@
+/* radare - LGPL - Copyright 2020 - thestr4ng3r, Yaroslav Stavnichiy */
+
+#ifndef R_JSON_H
+#define R_JSON_H
+
+#include <r_types.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/*
+ * r_json is a JSON parsing API,
+ * heavily based on nxjson by Yaroslav Stavnichiy <yarosla@gmail.com>,
+ * which is available under LGPLv3 or later.
+ *
+ * r_json does NOT format json, it only parses. To format json, see pj.h instead.
+ * It operates in-place, which means the parsed string will be MODIFIED.
+ * This means all string values in RJson point directly into the input string,
+ * removing the need to copy them.
+ *
+ * It also supports both line and block style comments.
+ */
+
+typedef enum r_json_type_t { // node kind; tells which member of the RJson union is meaningful
+	R_JSON_NULL, // the literal null; the parser stores no value for it
+	R_JSON_OBJECT,  // properties are the child nodes; each child carries its property name in key
+	R_JSON_ARRAY,   // items are the child nodes, in source order; their key is NULL
+	R_JSON_STRING,  // unescaped text is in the str_value field
+	R_JSON_INTEGER, // num.s_value when the literal starts with '-', num.u_value otherwise
+	R_JSON_DOUBLE,  // literal with a fraction or exponent; value is in the num.dbl_value field
+	R_JSON_BOOLEAN  // num.u_value is 1 for true and 0 for false
+} RJsonType;
+
+typedef struct r_json_t { // one node of the parsed tree
+	RJsonType type;             // type of json node; selects the valid union member below
+	const char *key;            // property name for an object's children; NULL for array items and the root
+	union { // members share storage: only read the one that matches type
+		const char *str_value;  // text of a STRING node; points into the parsed input buffer, not owned by the node
+		struct {
+			union {
+				ut64 u_value;   // INTEGER (non-negative literal) or BOOLEAN (0/1) value
+				st64 s_value;   // INTEGER value when the literal was negative
+			};
+			double dbl_value;   // value of a DOUBLE node; the parser also stores the converted value here for INTEGER nodes
+		} num;
+		struct {                // children of OBJECT or ARRAY
+			size_t count;           // number of nodes in the list
+			struct r_json_t *first; // head of the singly linked child list, NULL when empty
+			struct r_json_t *last;  // tail of the list, used for appending
+		} children;
+	};
+	struct r_json_t *next;    // next sibling in the parent's child list; NULL for the last child
+} RJson;
+
+R_API RJson *r_json_parse(char *text); // parse text in place (the buffer is modified and must outlive the result); returns the root node or NULL on error
+
+R_API void r_json_free(RJson *js); // free a node and all of its descendants; NULL is accepted; the input text is not freed
+
+R_API const RJson *r_json_get(const RJson *json, const char *key); // get object's property by key; NULL when no child has that key
+R_API const RJson *r_json_item(const RJson *json, size_t idx); // get array element by zero-based index; NULL when idx is out of range
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif  /* R_JSON_H */
--- a/libr/meson.build
+++ b/libr/meson.build
@ -403,6 +403,7 @@ r_util_files = [
  'include/r_util/r_hex.h',
  'include/r_util/r_idpool.h',
  'include/r_util/r_itv.h',
+  'include/r_util/r_json.h',
  'include/r_util/r_log.h',
  'include/r_util/r_mem.h',
  'include/r_util/r_name.h',
--- a/libr/util/Makefile
+++ b/libr/util/Makefile
@ -20,8 +20,8 @@ OBJS+=strpool.o bitmap.o time.o format.o pie.o print.o utype.o
 OBJS+=seven.o randomart.o zip.o debruijn.o log.o getopt.o table.o
 OBJS+=utf8.o utf16.o utf32.o strbuf.o lib.o name.o spaces.o signal.o syscmd.o
 OBJS+=udiff.o bdiff.o stack.o queue.o tree.o idpool.o assert.o
-OBJS+=punycode.o pkcs7.o x509.o asn1.o astr.o json_indent.o skiplist.o pj.o
-OBJS+=rbtree.o intervaltree.o qrcode.o vector.o str_constpool.o str_trim.o
+OBJS+=punycode.o pkcs7.o x509.o asn1.o astr.o json.o json_indent.o skiplist.o
+OBJS+=pj.o rbtree.o intervaltree.o qrcode.o vector.o str_constpool.o str_trim.o
 OBJS+=ascii_table.o protobuf.o
 OBJS+=annotated_code.o

--- a/libr/util/json.c
+++ b/libr/util/json.c
@ -0,0 +1,407 @@
+/* radare - LGPL - Copyright 2020 - thestr4ng3r, Yaroslav Stavnichiy */
+
+#include <errno.h>
+
+#include <r_util/r_utf8.h>
+#include <r_util/r_hex.h>
+#include <r_util/r_json.h>
+
+#if 0
+// optional error printing: switch the condition above to 1 to print parse errors (with the reporting source line) to stderr; otherwise the macro only evaluates and discards its arguments
+#define R_JSON_REPORT_ERROR(msg, p) fprintf(stderr, "R_JSON PARSE ERROR (%d): " msg " at %s\n", __LINE__, p)
+#else
+#define R_JSON_REPORT_ERROR(msg, p) do { (void)(msg); (void)(p); } while (0)
+#endif
+
+/*
+ * Allocate a fresh, detached node through R_NEW0.
+ * Returns whatever R_NEW0 yields (NULL is treated as allocation failure by callers).
+ */
+static RJson *json_new(void) {
+	RJson *node = R_NEW0 (RJson);
+	return node;
+}
+
+/*
+ * Allocate a node of the given type and append it to the end of parent's child list.
+ *
+ * key is stored as-is (not copied). parent must be non-NULL; its first/last/count
+ * fields are updated. Returns the new node, or NULL when allocation fails, in
+ * which case parent is left untouched.
+ */
+static RJson *create_json(RJsonType type, const char *key, RJson *parent) {
+	RJson *node = json_new ();
+	if (node) {
+		node->type = type;
+		node->key = key;
+		RJson *tail = parent->children.last;
+		if (tail) {
+			tail->next = node;
+		} else {
+			// empty list: the new node is also the head
+			parent->children.first = node;
+		}
+		parent->children.last = node;
+		parent->children.count++;
+	}
+	return node;
+}
+
+/*
+ * Release a node together with its whole subtree.
+ *
+ * Only OBJECT and ARRAY nodes have their child list walked; keys and string
+ * values are never freed here. Passing NULL is a no-op.
+ */
+R_API void r_json_free(RJson *js) {
+	if (!js) {
+		return;
+	}
+	switch (js->type) {
+	case R_JSON_OBJECT:
+	case R_JSON_ARRAY: {
+		RJson *child;
+		RJson *following;
+		// grab the sibling link before the child is destroyed
+		for (child = js->children.first; child; child = following) {
+			following = child->next;
+			r_json_free (child);
+		}
+		break;
+	}
+	default:
+		break;
+	}
+	free (js);
+}
+
+/*
+ * Decode, in place, the body of a JSON string.
+ *
+ * s points just past the opening quote. Escape sequences are rewritten into
+ * their byte values (\uXXXX, including surrogate pairs, is written via
+ * r_utf8_encode), the closing quote is replaced by a NUL terminator, and
+ * *end is set to the character following the closing quote.
+ * An unrecognized escape keeps its backslash and the following character
+ * is then processed as ordinary text.
+ *
+ * Returns s on success. Returns NULL, without touching *end, when a \u escape
+ * is malformed, the codepoint cannot be encoded, or the closing quote is
+ * missing; the buffer may already be partially rewritten at that point.
+ */
+static char *unescape_string(char *s, char **end) {
+	char *p = s; // read cursor
+	char *d = s; // write cursor; never overtakes p because decoded output is not longer than the escape
+	char c;
+	while ((c = *p++)) {
+		if (c == '"') {
+			*d = '\0';
+			*end = p;
+			return s;
+		}
+		if (c == '\\') {
+			switch (*p) {
+			case '\\':
+			case '/':
+			case '"':
+				*d++ = *p++;
+				break;
+			case 'b':
+				*d++ = '\b';
+				p++;
+				break;
+			case 'f':
+				*d++ = '\f';
+				p++;
+				break;
+			case 'n':
+				*d++ = '\n';
+				p++;
+				break;
+			case 'r':
+				*d++ = '\r';
+				p++;
+				break;
+			case 't':
+				*d++ = '\t';
+				p++;
+				break;
+			case 'u': { // unicode
+				char *ps = p - 1; // start of the escape, for error reporting
+				ut8 high = 0, low = 0;
+				// short-circuit evaluation stops at the first non-hex char, so a NUL is never read past
+				if (r_hex_to_byte (&high, p[1]) || r_hex_to_byte (&high, p[2])
+						|| r_hex_to_byte (&low, p[3]) || r_hex_to_byte (&low, p[4])) {
+					R_JSON_REPORT_ERROR ("invalid unicode escape", p - 1);
+					return NULL;
+				}
+				RRune codepoint = (RRune)high << 8 | (RRune)low;
+				if ((codepoint & 0xfc00) == 0xd800) { // high surrogate; need one more unicode to succeed
+					p += 6; // now expected to sit on the 'u' of the second escape
+					high = low = 0;
+					if (p[-1] != '\\' || *p != 'u'
+							|| r_hex_to_byte (&high, p[1]) || r_hex_to_byte (&high, p[2])
+							|| r_hex_to_byte (&low, p[3]) || r_hex_to_byte (&low, p[4])) {
+						R_JSON_REPORT_ERROR ("invalid unicode surrogate", ps);
+						return NULL;
+					}
+					RRune codepoint2 = (RRune)high << 8 | (RRune)low;
+					if ((codepoint2 & 0xfc00) != 0xdc00) {
+						R_JSON_REPORT_ERROR ("invalid unicode surrogate", ps);
+						return NULL;
+					}
+					codepoint = 0x10000 + ((codepoint - 0xd800) << 10) + (codepoint2 - 0xdc00);
+				}
+				int sz = r_utf8_encode ((ut8 *)d, codepoint);
+				// check the encoder result (the previous code tested the input pointer s, which is never NULL here)
+				if (sz < 1) {
+					R_JSON_REPORT_ERROR ("invalid codepoint", ps);
+					return NULL;
+				}
+				d += sz;
+				p += 5; // skip 'u' and the four hex digits
+				break;
+			}
+			default:
+				// leave untouched
+				*d++ = c;
+				break;
+			}
+		} else {
+			*d++ = c;
+		}
+	}
+	R_JSON_REPORT_ERROR ("no closing quote for string", s);
+	return NULL;
+}
+
+/*
+ * Step over a block comment.
+ *
+ * opener points at the two-character comment opener; the caller guarantees
+ * that opener[0], opener[1] and opener[2] may be read.
+ * Returns the position right after the closing star-slash pair, or NULL when
+ * the comment is never closed.
+ */
+static char *skip_block_comment(char *opener) {
+	char *cursor = opener + 2;
+	if (*cursor == '\0') {
+		R_JSON_REPORT_ERROR ("endless comment", opener);
+		return NULL;
+	}
+	// look at every following '/' until one is directly preceded by '*'
+	do {
+		cursor = strchr (cursor + 1, '/');
+		if (!cursor) {
+			R_JSON_REPORT_ERROR ("endless comment", opener);
+			return NULL;
+		}
+	} while (cursor[-1] != '*');
+	return cursor + 1;
+}
+
+/*
+ * Advance past whitespace, line comments and block comments.
+ *
+ * Returns the first position holding something else (possibly the terminating
+ * NUL). Returns NULL when a comment is not terminated (a line comment needs a
+ * trailing newline) or when a '/' does not start a comment.
+ */
+static char *skip_whitespace(char *p) {
+	while (*p != '\0') {
+		if (*p != '/') {
+			if (!IS_WHITECHAR (*p)) {
+				break;
+			}
+			p++;
+			continue;
+		}
+		// a '/' is only legal here as the start of a comment
+		if (p[1] == '/') {
+			char *comment_start = p;
+			char *eol = strchr (p + 2, '\n');
+			if (!eol) {
+				R_JSON_REPORT_ERROR ("endless comment", comment_start);
+				return NULL; // error
+			}
+			p = eol + 1;
+		} else if (p[1] == '*') {
+			p = skip_block_comment (p);
+			if (!p) {
+				return NULL;
+			}
+		} else {
+			R_JSON_REPORT_ERROR ("unexpected chars", p);
+			return NULL; // error
+		}
+	}
+	return p;
+}
+
+/*
+ * Read an object member name followed by its ':' separator.
+ *
+ * On success *key receives the unescaped name (pointing into the input) and
+ * the position right after the ':' is returned.
+ * If the next token is '}' instead, a pointer to that '}' is returned and
+ * *key is left unset. Any other input, including end of text, yields NULL.
+ */
+static char *parse_key(const char **key, char *p) {
+	p = skip_whitespace (p);
+	if (!p) {
+		return NULL;
+	}
+	char *token = p;
+	switch (*token) {
+	case '"': {
+		char *after = token + 1;
+		*key = unescape_string (after, &after);
+		if (!*key) {
+			return NULL; // propagate error
+		}
+		after = skip_whitespace (after);
+		if (!after) {
+			return NULL;
+		}
+		if (*after != ':') {
+			R_JSON_REPORT_ERROR ("unexpected chars", after);
+			return NULL;
+		}
+		return after + 1;
+	}
+	case '}':
+		return token;
+	default:
+		// also reached for the terminating NUL
+		R_JSON_REPORT_ERROR ("unexpected chars", token);
+		return NULL; // error
+	}
+}
+
+/*
+ * Parse one JSON value at p and append the resulting node to parent.
+ *
+ * key is the property name to store in the new node (NULL for array items
+ * and for the root). Objects and arrays are parsed recursively.
+ *
+ * Returns the position right after the value, or NULL on a syntax error, a
+ * number out of range, or an allocation failure. A ']' is returned as-is
+ * without creating a node, which is how empty arrays terminate.
+ * On error, nodes created so far stay linked under parent; the caller is
+ * responsible for freeing them.
+ */
+static char *parse_value(RJson *parent, const char *key, char *p) {
+	RJson *js;
+	p = skip_whitespace (p);
+	if (!p) {
+		return NULL;
+	}
+	switch (*p) {
+	case '\0':
+		R_JSON_REPORT_ERROR ("unexpected end of text", p);
+		return NULL; // error
+	case '{':
+		js = create_json (R_JSON_OBJECT, key, parent);
+		if (!js) {
+			return NULL; // allocation failure
+		}
+		p++;
+		while (1) {
+			const char *new_key;
+			p = parse_key (&new_key, p);
+			if (!p) {
+				return NULL; // error
+			}
+			if (*p != '}') {
+				p = parse_value (js, new_key, p);
+				if (!p) {
+					return NULL; // error
+				}
+			}
+			p = skip_whitespace (p);
+			if (!p) {
+				return NULL;
+			}
+			if (*p == ',') {
+				char *commapos = p;
+				p++;
+				p = skip_whitespace (p);
+				if (!p) {
+					return NULL;
+				}
+				if (*p == '}') {
+					R_JSON_REPORT_ERROR ("trailing comma", commapos);
+					return NULL;
+				}
+			} else if (*p == '}') {
+				return p + 1; // end of object
+			} else {
+				R_JSON_REPORT_ERROR ("unexpected chars", p);
+				return NULL;
+			}
+		}
+	case '[':
+		js = create_json (R_JSON_ARRAY, key, parent);
+		if (!js) {
+			return NULL; // allocation failure
+		}
+		p++;
+		while (1) {
+			p = parse_value (js, 0, p);
+			if (!p) {
+				return NULL; // error
+			}
+			p = skip_whitespace (p);
+			if (!p) {
+				return NULL;
+			}
+			if (*p == ',') {
+				char *commapos = p;
+				p++;
+				p = skip_whitespace (p);
+				if (!p) {
+					return NULL;
+				}
+				if (*p == ']') {
+					R_JSON_REPORT_ERROR ("trailing comma", commapos);
+					return NULL;
+				}
+			} else if (*p == ']') {
+				return p + 1; // end of array
+			} else {
+				R_JSON_REPORT_ERROR ("unexpected chars", p);
+				return NULL;
+			}
+		}
+	case ']':
+		// no node is created; the enclosing array loop consumes the bracket
+		return p;
+	case '"':
+		p++;
+		js = create_json (R_JSON_STRING, key, parent);
+		if (!js) {
+			return NULL; // allocation failure
+		}
+		js->str_value = unescape_string (p, &p);
+		if (!js->str_value) {
+			return NULL; // propagate error
+		}
+		return p;
+	case '-':
+	case '0':
+	case '1':
+	case '2':
+	case '3':
+	case '4':
+	case '5':
+	case '6':
+	case '7':
+	case '8':
+	case '9': {
+		js = create_json (R_JSON_INTEGER, key, parent);
+		if (!js) {
+			return NULL; // allocation failure
+		}
+		errno = 0;
+		char *pe;
+		if (*p == '-') {
+			js->num.s_value = (st64)strtoll (p, &pe, 10);
+		} else {
+			js->num.u_value = (ut64)strtoull (p, &pe, 10);
+		}
+		if (pe == p || errno == ERANGE) {
+			R_JSON_REPORT_ERROR ("invalid number", p);
+			return NULL; // error
+		}
+		if (*pe == '.' || *pe == 'e' || *pe == 'E') { // double value
+			// the integer prefix was only a probe; reparse the whole literal as a double
+			js->type = R_JSON_DOUBLE;
+			errno = 0;
+			js->num.dbl_value = strtod (p, &pe);
+			if (pe == p || errno == ERANGE) {
+				R_JSON_REPORT_ERROR ("invalid fractional number", p);
+				return NULL; // error
+			}
+		} else {
+			// keep a floating point copy of the integer next to it
+			if (*p == '-') {
+				js->num.dbl_value = js->num.s_value;
+			} else {
+				js->num.dbl_value = js->num.u_value;
+			}
+		}
+		return pe;
+	}
+	case 't':
+		if (!strncmp (p, "true", 4)) {
+			js = create_json (R_JSON_BOOLEAN, key, parent);
+			if (!js) {
+				return NULL; // allocation failure
+			}
+			js->num.u_value = 1;
+			return p + 4;
+		}
+		R_JSON_REPORT_ERROR ("unexpected chars", p);
+		return NULL; // error
+	case 'f':
+		if (!strncmp (p, "false", 5)) {
+			js = create_json (R_JSON_BOOLEAN, key, parent);
+			if (!js) {
+				return NULL; // allocation failure
+			}
+			js->num.u_value = 0;
+			return p + 5;
+		}
+		R_JSON_REPORT_ERROR ("unexpected chars", p);
+		return NULL; // error
+	case 'n':
+		if (!strncmp (p, "null", 4)) {
+			if (!create_json (R_JSON_NULL, key, parent)) {
+				return NULL; // allocation failure: do not silently drop the value
+			}
+			return p + 4;
+		}
+		R_JSON_REPORT_ERROR ("unexpected chars", p);
+		return NULL; // error
+	default:
+		R_JSON_REPORT_ERROR ("unexpected chars", p);
+		return NULL; // error
+	}
+	return NULL;
+}
+
+/*
+ * Parse a NUL-terminated JSON text and return the root node of the tree.
+ *
+ * text is modified in place, and keys and string values of the result point
+ * into it, so it must stay alive and unchanged while the tree is in use.
+ * Characters following the first complete value are not examined.
+ *
+ * Returns NULL when text is NULL, on a parse error (any partially built tree
+ * is freed), or when no value node was produced. Free the result with
+ * r_json_free().
+ */
+R_API RJson *r_json_parse(char *text) {
+	if (!text) {
+		return NULL;
+	}
+	// stack-allocated pseudo parent: the real root ends up as its only child
+	RJson js = {0};
+	if (!parse_value (&js, NULL, text)) {
+		r_json_free (js.children.first);
+		return NULL;
+	}
+	return js.children.first;
+}
+
+/*
+ * Look up a property of an object node by name.
+ *
+ * Returns the first child whose key equals key, or NULL when there is none.
+ * NULL is also returned when json or key is NULL, or when json is not an
+ * R_JSON_OBJECT: for other node types the children fields share storage with
+ * the string/number value and must not be followed as pointers.
+ */
+R_API const RJson *r_json_get(const RJson *json, const char *key) {
+	if (!json || !key || json->type != R_JSON_OBJECT) {
+		return NULL;
+	}
+	const RJson *js;
+	for (js = json->children.first; js; js = js->next) {
+		if (js->key && !strcmp (js->key, key)) {
+			return js;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Return the child at zero-based position idx of an array (or object) node.
+ *
+ * Returns NULL when idx is out of range, when json is NULL, or when json is
+ * neither R_JSON_ARRAY nor R_JSON_OBJECT: for other node types the children
+ * fields share storage with the string/number value and must not be followed
+ * as pointers.
+ */
+R_API const RJson *r_json_item(const RJson *json, size_t idx) {
+	if (!json || (json->type != R_JSON_ARRAY && json->type != R_JSON_OBJECT)) {
+		return NULL;
+	}
+	const RJson *js;
+	for (js = json->children.first; js; js = js->next) {
+		if (!idx--) {
+			return js;
+		}
+	}
+	return NULL;
+}
+
--- a/libr/util/meson.build
+++ b/libr/util/meson.build
@ -24,6 +24,7 @@ r_util_sources = [
  'graph.c',
  'hex.c',
  'idpool.c',
+  'json.c',
  'json_indent.c',
  'lib.c',
  'list.c',
--- a/test/unit/meson.build
+++ b/test/unit/meson.build
@ -33,6 +33,7 @@ if get_option('enable_tests')
    'hex',
    'intervaltree',
    'io',
+    'json',
    'list',
    'parse_ctype',
    'pj',
--- a/test/unit/minunit.h
+++ b/test/unit/minunit.h
@ -176,13 +176,15 @@ void sprint_mem(char *out, const ut8 *buf, size_t len) {
 		mu_assert(_meqstr, memcmp((exp__), (act__), (len)) == 0); \
 } while(0)

-#define mu_run_test(test) do { int result; \
-		printf(TBOLD #test TRESET " "); \
-		result = test(); \
+#define mu_run_test_named(test, name, ...) do { int result; \
+		printf(TBOLD "%s" TRESET " ", name); \
+		result = test(__VA_ARGS__); \
 		tests_run++; \
 		tests_passed += result; \
 } while (0)

+#define mu_run_test(test, ...) mu_run_test_named (test, #test, __VA_ARGS__)
+
 #define mu_cleanup_fail(label, message) do { mu_perror(message); retval = MU_ERR; goto label; } while(0)
 #define mu_cleanup_sysfail(label, message) do { mu_psyserror(message); retval = MU_ERR; goto label; } while(0)
 int tests_run = 0;
--- a/test/unit/test_json.c
+++ b/test/unit/test_json.c