Compile new shell parser by default (#16660)

* Move radare2-shell-parser/tree-sitter into shlr
* compile new shell parser by default, available with cfg.newshell variable
* Add README for radare2-shell-parser
* Improve CI
* Add gitattributes file
This commit is contained in:
Riccardo Schirone 2020-04-21 13:44:30 +02:00 committed by GitHub
parent 1b56d63df2
commit 06ab29b93c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
84 changed files with 69756 additions and 228 deletions

3
.gitattributes vendored Normal file
View File

@ -0,0 +1,3 @@
# Flag the shell-parser files below as generated for GitHub Linguist
# (linguist-generated=true), so they are not treated as hand-written sources.
shlr/radare2-shell-parser/src/grammar.json linguist-generated=true
shlr/radare2-shell-parser/src/node-types.json linguist-generated=true
shlr/radare2-shell-parser/src/parser.c linguist-generated=true

View File

@ -0,0 +1,26 @@
# Regenerates the radare2 shell parser and runs its tree-sitter tests for pull
# requests against master that touch shlr/tree-sitter or
# shlr/radare2-shell-parser.
name: Radare2 CI tree-sitter test
on:
  pull_request:
    paths:
      # '**' is needed here: a single '*' does not match the '/' character, so
      # files in nested directories (src/, corpus/, lib/...) would otherwise
      # never trigger this workflow.
      - 'shlr/tree-sitter/**'
      - 'shlr/radare2-shell-parser/**'
    branches:
      - master
jobs:
  build:
    name: radare2-shell-parser-tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-node@v1
        with:
          # Quoted so the version stays a string.
          node-version: '12.x'
      # Installs the npm dependencies; the next step expects the tree-sitter
      # executable in node_modules/.bin.
      - run: cd shlr/radare2-shell-parser/ && npm install
      - name: Run tests
        run: |
          cd shlr/radare2-shell-parser
          export PATH=${PATH}:./node_modules/.bin
          tree-sitter generate
          tree-sitter test

View File

@ -1,58 +0,0 @@
name: Radare2 CI newshell
on:
pull_request:
branches:
- master
push:
branches:
- master
jobs:
build:
name: ${{ matrix.name }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
name: [linux-meson-newshell]
include:
- name: linux-meson-newshell
os: ubuntu-latest
build_system: meson
compiler: gcc
meson_options: -Duse_treesitter=true
newshell: newshell
steps:
- uses: actions/checkout@v2
- name: Install meson and ninja
run: sudo apt-get --assume-yes install python3-wheel python3-setuptools && pip3 install --user meson ninja
- name: Checkout our Testsuite Binaries
uses: actions/checkout@v2
with:
repository: radareorg/radare2-testbins
path: test/bins
- name: Build with Meson
run: |
export PATH=${HOME}/.local/bin:${PATH}
meson ${{ matrix.meson_options }} --prefix=${HOME} build && ninja -C build
env:
CC: ${{ matrix.compiler }}
- name: Install with meson
run: |
# Install the radare2
export PATH=${HOME}/bin:${HOME}/.local/bin:${PATH}
export LD_LIBRARY_PATH=${HOME}/lib/$(uname -m)-linux-gnu:${HOME}/lib:${HOME}/lib64:${LD_LIBRARY_PATH}
export PKG_CONFIG_PATH=${HOME}/lib/pkgconfig:${HOME}/lib/$(uname -m)-linux-gnu/pkgconfig:${PKG_CONFIG_PATH}
ninja -C build install
- name: Run tests
if: startswith(github.event.pull_request.head.ref, 'newshell-') || github.event_name == 'push'
run: |
# Running the test suite
export PATH=${HOME}/bin:${HOME}/.local/bin:${PATH}
export LD_LIBRARY_PATH=${HOME}/lib/$(uname -m)-linux-gnu:${HOME}/lib:${HOME}/lib64:${LD_LIBRARY_PATH}
export PKG_CONFIG_PATH=${HOME}/lib/pkgconfig:${HOME}/lib/$(uname -m)-linux-gnu/pkgconfig:${PKG_CONFIG_PATH}
export R2_CFG_NEWSHELL=1
cd test
make

View File

@ -11,7 +11,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
name: [linux-acr-gcc-tests, linux-acr-clang-build, linux-meson-gcc-build, macos-clang-tests]
name: [linux-acr-gcc-tests, linux-acr-clang-build, linux-meson-gcc-build, linux-meson-gcc-newshell-tests, macos-clang-tests]
include:
- name: linux-acr-gcc-tests
os: ubuntu-latest
@ -26,6 +26,12 @@ jobs:
os: ubuntu-latest
build_system: meson
compiler: gcc
- name: linux-meson-gcc-newshell-tests
os: ubuntu-latest
build_system: meson
compiler: gcc
run_tests: true
newshell: true
- name: macos-clang-tests
os: macos-latest
build_system: acr
@ -83,6 +89,12 @@ jobs:
export PATH=${HOME}/bin:${HOME}/.local/bin:${PATH}
export LD_LIBRARY_PATH=${HOME}/lib/$(uname -m)-linux-gnu:${HOME}/lib:${HOME}/lib64:${LD_LIBRARY_PATH}
export PKG_CONFIG_PATH=${HOME}/lib/pkgconfig:${HOME}/lib/$(uname -m)-linux-gnu/pkgconfig:${PKG_CONFIG_PATH}
if [ "$NEWSHELL" == "true" ]; then
export R2_CFG_NEWSHELL=1
fi
cd test
radare2 -N -Qc 'e cfg.newshell' -
make
env:
NEWSHELL: ${{ matrix.newshell }}

2
.gitignore vendored
View File

@ -58,8 +58,6 @@ libr/include/r_userconf.h
libr/include/r_version.h
libr/include/r_version.h.tmp
shlr/capstone/
shlr/tree-sitter/
shlr/radare2-shell-parser/
shlr/java/out
shlr/java/out.exe
shlr/sdb/sdb

24
configure vendored
View File

@ -22,7 +22,6 @@ DEBUGGER=1
USE_MAGIC=0
CSNEXT=0
LOADLIBS=1
USE_TREESITTER=0
HAVE_FORK=1
WANT_PTRACE_WRAP=1
WITH_LIBR=0
@ -168,13 +167,11 @@ System types:
--target=TARGET configure for building compilers for TARGET [HOST]
EOF2
printf "
Optional Features:
printf "\nOptional Features:
--disable-debugger disable native debugger features
--with-sysmagic force to use system's magic
--with-capstone5 build next branch of the capstone repository
--disable-loadlibs disable loading plugins
--with-shell-parser Compile with radare2-shell-parser experimental support
--without-fork disable fork
--without-ptrace-wrap disable ptrace-wrap build
--with-libr build libr.a and libr.dylib
@ -189,10 +186,8 @@ Optional Features:
--with-ostype Choose OS type ( gnulinux windows darwin haiku ) (USEROSTYPE=auto)
--with-libversion specify different libversion (LIBVERSION=xxx)
--without-jemalloc build without jemalloc
--with-checks-level value between 0 and 3 to enable different level of assert (see R_CHECKS_LEVEL) (R_CHECKS_LEVEL=2)
"
printf "
Some influential environment variables:
--with-checks-level value between 0 and 3 to enable different level of assert (see R_CHECKS_LEVEL) (R_CHECKS_LEVEL=2)\n"
printf "\nSome influential environment variables:
CC C compiler command
CFLAGS C compiler flags
CPPFLAGS C preprocessor flags
@ -200,10 +195,8 @@ Some influential environment variables:
nonstandard directory <lib dir>
CPPFLAGS C/C++ preprocessor flags, e.g. -I<include dir> if you have
headers in a nonstandard directory <include dir>
CPP C preprocessor
"
printf "
Report bugs to: pancake <pancake@nopcode.org>"
CPP C preprocessor\n"
printf "\nReport bugs to: pancake <pancake@nopcode.org>"
echo ""
exit 0
}
@ -245,7 +238,7 @@ echo "LANGS: c"
echo "REQUIRED: libdl"
echo "OPTIONAL: libmagic libz libzip libxxhash libssl liblibuv>=1.0.0"
echo "PKG-CONFIG: capstone openssl libuv"
echo "FLAGS: --disable-debugger --with-sysmagic --with-capstone5 --disable-loadlibs --with-shell-parser --without-fork --without-ptrace-wrap --with-libr --with-syscapstone --with-syszip --with-sysxxhash --without-gpl --with-openssl --without-libuv --with-rpath --with-compiler=gcc --with-ostype=auto --with-libversion=xxx --without-jemalloc --with-checks-level=2"
echo "FLAGS: --disable-debugger --with-sysmagic --with-capstone5 --disable-loadlibs --without-fork --without-ptrace-wrap --with-libr --with-syscapstone --with-syszip --with-sysxxhash --without-gpl --with-openssl --without-libuv --with-rpath --with-compiler=gcc --with-ostype=auto --with-libversion=xxx --without-jemalloc --with-checks-level=2"
exit 0
;;
--cache-file)
@ -295,7 +288,6 @@ echo "FLAGS: --disable-debugger --with-sysmagic --with-capstone5 --disable-l
"--with-sysmagic") USE_MAGIC="1"; ;;
"--with-capstone5") CSNEXT="1"; ;;
"--disable-loadlibs") LOADLIBS="0"; ;;
"--with-shell-parser") USE_TREESITTER="1"; ;;
"--without-fork") HAVE_FORK="0"; ;;
"--without-ptrace-wrap") WANT_PTRACE_WRAP="0"; ;;
"--with-libr") WITH_LIBR="1"; ;;
@ -328,7 +320,7 @@ parse_options "$1"
shift
done
ENVWORDS="MANDIR INFODIR LIBDIR INCLUDEDIR LOCALSTATEDIR SYSCONFDIR DATADIR DOCDIR LIBEXECDIR SBINDIR BINDIR EPREFIX PREFIX SPREFIX TARGET HOST BUILD INSTALL INSTALL_LIB INSTALL_MAN INSTALL_PROGRAM INSTALL_PROGRAM_STRIP INSTALL_DIR INSTALL_SCRIPT INSTALL_DATA HOST_OS HOST_CPU BUILD_OS BUILD_CPU TARGET_OS TARGET_CPU VERSION VERSION_MAJOR VERSION_MINOR VERSION_PATCH VERSION_NUMBER PKGNAME VPATH CONTACT CONTACT_NAME CONTACT_MAIL CC CFLAGS CPPFLAGS LDFLAGS HAVE_LANG_C DEBUGGER HAVE_LIB_DL DL_LIBS HAVE_PATCH PATCH HAVE_GIT GIT HAVE_LIB_MAGIC USE_MAGIC USE_LIB_MAGIC LIBMAGIC CSNEXT LOADLIBS USE_TREESITTER HAVE_FORK WANT_PTRACE_WRAP WITH_LIBR WITH_CAPSTONE CAPSTONE_CFLAGS CAPSTONE_LDFLAGS HAVE_PKGCFG_CAPSTONE USE_CAPSTONE HAVE_LIB_Z HAVE_LIB_ZIP USE_ZIP USE_LIB_ZIP LIBZIP HAVE_LIB_XXHASH USE_XXHASH USE_LIB_XXHASH LIBXXHASH WITH_GPL HAVE_DECL_ADDR_NO_RANDOMIZE HAVE_ARC4RANDOM_UNIFORM HAVE_EXPLICIT_BZERO HAVE_EXPLICIT_MEMSET HAVE_CLOCK_NANOSLEEP HAVE_SIGACTION HAVE_LIB_GMP HAVE_LIB_SSL SSL_CFLAGS SSL_LDFLAGS HAVE_PKGCFG_OPENSSL HAVE_OPENSSL WANT_OPENSSL HAVE_LIBUV_VERSION_1_0_0 LIBUV_CFLAGS LIBUV_LDFLAGS HAVE_PKGCFG_LIBUV HAVE_LIBUV WANT_LIBUV USE_RPATH USERCC USEROSTYPE LIBVERSION HAVE_JEMALLOC HAVE_PTRACE USE_PTRACE_WRAP R_CHECKS_LEVEL"
ENVWORDS="MANDIR INFODIR LIBDIR INCLUDEDIR LOCALSTATEDIR SYSCONFDIR DATADIR DOCDIR LIBEXECDIR SBINDIR BINDIR EPREFIX PREFIX SPREFIX TARGET HOST BUILD INSTALL INSTALL_LIB INSTALL_MAN INSTALL_PROGRAM INSTALL_PROGRAM_STRIP INSTALL_DIR INSTALL_SCRIPT INSTALL_DATA HOST_OS HOST_CPU BUILD_OS BUILD_CPU TARGET_OS TARGET_CPU VERSION VERSION_MAJOR VERSION_MINOR VERSION_PATCH VERSION_NUMBER PKGNAME VPATH CONTACT CONTACT_NAME CONTACT_MAIL CC CFLAGS CPPFLAGS LDFLAGS HAVE_LANG_C DEBUGGER HAVE_LIB_DL DL_LIBS HAVE_PATCH PATCH HAVE_GIT GIT HAVE_LIB_MAGIC USE_MAGIC USE_LIB_MAGIC LIBMAGIC CSNEXT LOADLIBS HAVE_FORK WANT_PTRACE_WRAP WITH_LIBR WITH_CAPSTONE CAPSTONE_CFLAGS CAPSTONE_LDFLAGS HAVE_PKGCFG_CAPSTONE USE_CAPSTONE HAVE_LIB_Z HAVE_LIB_ZIP USE_ZIP USE_LIB_ZIP LIBZIP HAVE_LIB_XXHASH USE_XXHASH USE_LIB_XXHASH LIBXXHASH WITH_GPL HAVE_DECL_ADDR_NO_RANDOMIZE HAVE_ARC4RANDOM_UNIFORM HAVE_EXPLICIT_BZERO HAVE_EXPLICIT_MEMSET HAVE_CLOCK_NANOSLEEP HAVE_SIGACTION HAVE_LIB_GMP HAVE_LIB_SSL SSL_CFLAGS SSL_LDFLAGS HAVE_PKGCFG_OPENSSL HAVE_OPENSSL WANT_OPENSSL HAVE_LIBUV_VERSION_1_0_0 LIBUV_CFLAGS LIBUV_LDFLAGS HAVE_PKGCFG_LIBUV HAVE_LIBUV WANT_LIBUV USE_RPATH USERCC USEROSTYPE LIBVERSION HAVE_JEMALLOC HAVE_PTRACE USE_PTRACE_WRAP R_CHECKS_LEVEL"
create_environ
@ -696,7 +688,7 @@ done
do_remove
echo
echo "Final report:"
for A in R_CHECKS_LEVEL PREFIX HAVE_LIB_GMP HAVE_OPENSSL HAVE_LIBUV USE_CAPSTONE HAVE_PTRACE USE_PTRACE_WRAP HAVE_FORK USE_TREESITTER VERSION USE_LIB_ZIP USE_LIB_MAGIC USE_LIB_XXHASH DEBUGGER CC USERCC HAVE_ARC4RANDOM_UNIFORM HAVE_EXPLICIT_BZERO HAVE_EXPLICIT_MEMSET USEROSTYPE LIBVERSION BUILD HOST TARGET ; do # REPORT
for A in R_CHECKS_LEVEL PREFIX HAVE_LIB_GMP HAVE_OPENSSL HAVE_LIBUV USE_CAPSTONE HAVE_PTRACE USE_PTRACE_WRAP HAVE_FORK VERSION USE_LIB_ZIP USE_LIB_MAGIC USE_LIB_XXHASH DEBUGGER CC USERCC HAVE_ARC4RANDOM_UNIFORM HAVE_EXPLICIT_BZERO HAVE_EXPLICIT_MEMSET USEROSTYPE LIBVERSION BUILD HOST TARGET ; do # REPORT
eval VAL="\$${A}"
[ -z "${VAL}" ] && VAL="(null)"
echo " - ${A} = ${VAL}"

View File

@ -26,8 +26,6 @@ ARG_WITH CSNEXT capstone5 build next branch of the capstone repository ;
(( useful for static builds . see sys/static.sh ))
ARG_DISABLE LOADLIBS loadlibs disable loading plugins ;
ARG_WITH USE_TREESITTER shell-parser Compile with radare2-shell-parser experimental support ;
ARG_WITHOUT HAVE_FORK fork disable fork ;
ARG_WITHOUT WANT_PTRACE_WRAP ptrace-wrap disable ptrace-wrap build ;
@ -216,7 +214,7 @@ IFEQ WANT_PTRACE_WRAP 0 ; {
ARG_WITH R_CHECKS_LEVEL=2 checks-level value between 0 and 3 to enable different level of assert (see R_CHECKS_LEVEL) ;
REPORT R_CHECKS_LEVEL PREFIX HAVE_LIB_GMP HAVE_OPENSSL HAVE_LIBUV USE_CAPSTONE HAVE_PTRACE USE_PTRACE_WRAP HAVE_FORK
USE_TREESITTER VERSION USE_LIB_ZIP USE_LIB_MAGIC USE_LIB_XXHASH DEBUGGER CC USERCC HAVE_ARC4RANDOM_UNIFORM
VERSION USE_LIB_ZIP USE_LIB_MAGIC USE_LIB_XXHASH DEBUGGER CC USERCC HAVE_ARC4RANDOM_UNIFORM
HAVE_EXPLICIT_BZERO HAVE_EXPLICIT_MEMSET USEROSTYPE LIBVERSION BUILD HOST TARGET ;
(( TODO: Add the rest of .pc files here.. add a rule for acr? ))

View File

@ -13,9 +13,7 @@ OBJS+=vmenus.o vmenus_graph.o vmenus_zigns.o zdiff.o citem.o
OBJS+=task.o panels.o pseudo.o vmarks.o anal_tp.o anal_objc.o blaze.o cundo.o
CFLAGS+=-I../../shlr/heap/include
ifeq ($(USE_TREESITTER),1)
CFLAGS+=-I../../shlr/tree-sitter/lib/include -I../../shlr/radare2-shell-parser/src/tree_parser
endif
CFLAGS+=-DR2_PLUGIN_INCORE -I../../shlr
LDFLAGS+=${DL_LIBS}
@ -43,9 +41,7 @@ OBJS+=$(STATIC_OBJS)
#STATIC_OBJS=$(subst ..,p/..,$(subst core_,p/core_,$(STATIC_OBJ)))
include $(TOP)/shlr/gdb/deps.mk
ifeq ($(USE_TREESITTER),1)
include $(TOP)/shlr/radare2-shell-parser-deps.mk
endif
include $(LTOP)/rules.mk
# include plugins

View File

@ -27,20 +27,17 @@
#include <sys/utsname.h>
#endif
#include <tree_sitter/api.h>
TSLanguage *tree_sitter_r2cmd ();
// NOTE: this should be in sync with SPECIAL_CHARACTERS in
// radare2-shell-parser grammar, except for ", ' and
// whitespaces, because we let cmd_substitution_arg create
// new arguments
static const char *SPECIAL_CHARS_REGULAR = "@;~$#|`\"'()<>";
#if USE_TREESITTER
static const char *SPECIAL_CHARS_PF = "@;~$#|`\"'<>";
#include <tree_sitter/api.h>
TSLanguage *tree_sitter_r2cmd ();
static const char *SPECIAL_CHARS_DOUBLE_QUOTED = "\"";
static const char *SPECIAL_CHARS_SINGLE_QUOTED = "'";
#endif
R_API void r_save_panels_layout(RCore *core, const char *_name);
R_API bool r_load_panels_layout(RCore *core, const char *_name);
@ -4542,8 +4539,6 @@ out_finish:
static int run_cmd_depth(RCore *core, char *cmd);
#if USE_TREESITTER
struct tsr2cmd_state {
TSParser *parser;
RCore *core;
@ -5085,6 +5080,7 @@ DEFINE_HANDLE_TS_FCN(redirect_command) {
TSNode redirect_op = ts_node_child_by_field_name (node, "redirect_operator", strlen ("redirect_operator"));
if (is_ts_fdn_redirect_operator (redirect_op)) {
// this is the default operation, no html and no append
} else if (is_ts_fdn_append_operator (redirect_op)) {
is_append = true;
} else if (is_ts_html_redirect_operator (redirect_op)) {
@ -5405,7 +5401,7 @@ DEFINE_HANDLE_TS_FCN(tmp_reli_command) {
ut64 orig_offset = state->core->offset;
ut64 addr = r_num_math (core->num, arg_str);
if (addr) {
r_core_cmdf (core, "so %d", addr);
r_core_cmdf (core, "so %" PFMT64d, addr);
}
bool res = handle_ts_command_tmpseek (state, command);
r_core_seek (state->core, orig_offset, true);
@ -5435,7 +5431,7 @@ DEFINE_HANDLE_TS_FCN(tmp_fd_command) {
TSNode command = ts_node_named_child (node, 0);
TSNode arg = ts_node_named_child (node, 1);
char *arg_str = ts_node_handle_arg (state, node, arg, 1);
int tmpfd = core->io->desc ? core->io->desc->fd : -1;
int tmpfd = core->io->desc? core->io->desc->fd: -1;
r_io_use_fd (core->io, atoi (arg_str));
bool res = handle_ts_command (state, command);
r_io_use_fd (core->io, tmpfd);
@ -6522,7 +6518,7 @@ DEFINE_HANDLE_TS_FCN(commands) {
#define HANDLER_RULE_OP(name) { #name, handle_ts_##name },
#define RULE_OP(name)
struct ts_data_symbol_map map[] = {
struct ts_data_symbol_map map_ts_command_handlers[] = {
#include "r2-shell-parser-cmds.inc"
{ NULL, NULL },
};
@ -6530,7 +6526,7 @@ struct ts_data_symbol_map map[] = {
#define RULE_OP(name) { #name, &ts_##name##_symbol },
#define HANDLER_RULE_OP(name) RULE_OP(name)
struct ts_data_symbol_map map_symbols[] = {
struct ts_data_symbol_map map_ts_symbols[] = {
#include "r2-shell-parser-cmds.inc"
{ NULL, NULL },
};
@ -6542,14 +6538,14 @@ static void ts_symbols_init(RCmd *cmd) {
TSLanguage *lang = tree_sitter_r2cmd ();
cmd->language = lang;
cmd->ts_symbols_ht = ht_up_new0 ();
struct ts_data_symbol_map *entry = map;
struct ts_data_symbol_map *entry = map_ts_command_handlers;
while (entry->name) {
TSSymbol symbol = ts_language_symbol_for_name (lang, entry->name, strlen (entry->name), true);
ht_up_insert (cmd->ts_symbols_ht, symbol, entry->data);
entry++;
}
entry = map_symbols;
entry = map_ts_symbols;
while (entry->name) {
TSSymbol *sym_ptr = entry->data;
*sym_ptr = ts_language_symbol_for_name (lang, entry->name, strlen (entry->name), true);
@ -6608,7 +6604,6 @@ static bool core_cmd_tsr2cmd(RCore *core, const char *cstr, bool split_lines, bo
core->cons->context->cmd_depth++;
return res;
}
#endif
static int run_cmd_depth(RCore *core, char *cmd) {
char *rcmd;
@ -6640,11 +6635,7 @@ static int run_cmd_depth(RCore *core, char *cmd) {
R_API int r_core_cmd(RCore *core, const char *cstr, int log) {
if (core->use_tree_sitter_r2cmd) {
#if USE_TREESITTER
return core_cmd_tsr2cmd (core, cstr, false, log)? 0: 1;
#else
R_LOG_WARN ("No compilation support for radare2-shell-parser\n");
#endif
}
int ret = false, i;

View File

@ -41,9 +41,7 @@ R_API RCmd *r_cmd_free(RCmd *cmd) {
if (!cmd) {
return NULL;
}
#if USE_TREESITTER
ht_up_free (cmd->ts_symbols_ht);
#endif
r_cmd_alias_free (cmd);
r_cmd_macro_fini (&cmd->macro);
// dinitialize plugin commands

View File

@ -76,10 +76,8 @@ typedef struct r_cmd_t {
RList *lcmds;
RList *plist;
RCmdAlias aliases;
#if USE_TREESITTER
void *language; // used to store TSLanguage *
HtUP *ts_symbols_ht;
#endif
} RCmd;
// TODO WIP

View File

@ -98,8 +98,6 @@
#define USE_PTRACE_WRAP @USE_PTRACE_WRAP@
#define HAVE_FORK @HAVE_FORK@
#define USE_TREESITTER @USE_TREESITTER@
#define WITH_GPL @WITH_GPL@
#if __APPLE__ && __POWERPC__

View File

@ -343,7 +343,6 @@ userconf.set10('HAVE_FORK', true)
userconf.set10('HAVE_PTRACE', have_ptrace)
userconf.set10('USE_PTRACE_WRAP', use_ptrace_wrap)
userconf.set10('WITH_GPL', true)
userconf.set10('USE_TREESITTER', get_option('use_treesitter'))
ok = cc.has_header_symbol('sys/personality.h', 'ADDR_NO_RANDOMIZE')
userconf.set10('HAVE_DECL_ADDR_NO_RANDOMIZE', ok)

View File

@ -27,13 +27,11 @@ option('use_sys_lz4', type: 'boolean', value: false)
option('use_sys_xxhash', type: 'boolean', value: false)
option('use_sys_openssl', type: 'boolean', value: false)
option('use_libuv', type: 'boolean', value: true)
option('use_treesitter', type: 'boolean', value: false)
option('debugger', type: 'boolean', value: true)
option('use_webui', type: 'boolean', value: false, description: 'install different WebUIs for radare2')
option('shell_parser_in_builddir', type: 'boolean', value: true, description: 'When true, radare2-shell-parser is downloaded in the build directory')
option('tree_sitter_in_builddir', type: 'boolean', value: true, description: 'When true, tree-sitter is downloaded in the build directory')
option('enable_tests', type: 'boolean', value: false, description: 'Build unit tests in test/unit')
option('enable_r2r', type: 'boolean', value: true, description: 'Build r2r executable for regression testing')
option('enable_r2r', type: 'boolean', value: true, description: 'Build r2r executable for regression testing')
option('tree-sitter-sync', type: 'boolean', value: false, description: 'Force a sync of shlr/tree-sitter before building')

View File

@ -31,11 +31,6 @@ TS_URL=https://github.com/tree-sitter/tree-sitter.git
TS_BRA=master
TS_TIP=f049ba350f3f6019ce9a1cbb0975ebd154ef7ad3
# NOTE: when you update SHELLPARSER_TIP or SHELLPARSER_BRA, also update them in shlr/meson.build
SHELLPARSER_URL=https://github.com/ret2libc/radare2-shell-parser.git
SHELLPARSER_BRA=master
SHELLPARSER_TIP=3d82cad9d865cb6e65364f66f038d1d1d4d8818a
ifeq ($(CS_RELEASE),1)
CS_VER=4.0.1
CS_TAR=https://codeload.github.com/aquynh/capstone/tar.gz/$(CS_VER)
@ -65,7 +60,7 @@ CS_REV=
CS_PATCHES=1
endif
.PHONY: capstone-sync capstone-build all clean mrproper libgdbr libwindbg bochs tree-sitter-sync radare2-shell-parser-sync
.PHONY: capstone-sync capstone-build all clean mrproper libgdbr libwindbg bochs tree-sitter-sync
HOST_CC?=gcc
SHLR?=$(shell pwd)
@ -329,37 +324,37 @@ else
cd ../../radare2-webui/www/m && git pull ; npm i ; $(MAKE) release
endif
ifeq ($(USE_TREESITTER),1)
tree-sitter-build: tree-sitter/libtree-sitter.$(EXT_AR)
tree-sitter/libtree-sitter.$(EXT_AR): tree-sitter/lib/src/lib.o
$(AR) rvs $@ $<
$(RANLIB) $@
tree-sitter/lib/src/lib.o: tree-sitter-sync
tree-sitter/lib/src/lib.o:
$(CC) -std=c99 -c tree-sitter/lib/src/lib.c -o $@ -Itree-sitter/lib/include -Itree-sitter/lib/src $(CFLAGS)
tree-sitter-sync:
"$(SHELL)" clone_3rd_repo.sh tree-sitter "${TS_URL}" "${TS_BRA}" "${TS_TIP}"
rm -rf tree-sitter tree-sitter.vc
"$(SHELL)" clone_3rd_repo.sh tree-sitter.vc "${TS_URL}" "${TS_BRA}" "${TS_TIP}"
mkdir -p tree-sitter/lib
cp -rf ./tree-sitter.vc/lib/src ./tree-sitter/lib
cp -rf ./tree-sitter.vc/lib/include ./tree-sitter/lib
rm -rf tree-sitter.vc
radare2-shell-parser-build: radare2-shell-parser/libshell-parser.$(EXT_AR)
radare2-shell-parser/libshell-parser.$(EXT_AR): radare2-shell-parser/src/parser.o
$(AR) rvs $@ $<
radare2-shell-parser/libshell-parser.$(EXT_AR): radare2-shell-parser/src/parser.o radare2-shell-parser/src/scanner.o
$(AR) rvs $@ radare2-shell-parser/src/parser.o radare2-shell-parser/src/scanner.o
$(RANLIB) $@
radare2-shell-parser/src/parser.o: radare2-shell-parser-sync
radare2-shell-parser/src/parser.o: radare2-shell-parser/src/parser.c
$(CC) -c radare2-shell-parser/src/parser.c -o $@ -Iradare2-shell-parser/src/tree_sitter -Itree-sitter/lib/include $(CFLAGS)
radare2-shell-parser-sync: tree-sitter-sync
"$(SHELL)" clone_3rd_repo.sh radare2-shell-parser "${SHELLPARSER_URL}" "${SHELLPARSER_BRA}" "${SHELLPARSER_TIP}"
radare2-shell-parser/src/scanner.o: radare2-shell-parser/src/scanner.c
$(CC) -c radare2-shell-parser/src/scanner.c -o $@ -Iradare2-shell-parser/src/tree_sitter -Itree-sitter/lib/include $(CFLAGS)
SHLRS+=tree-sitter/libtree-sitter.a
SHLRS+=radare2-shell-parser/libshell-parser.a
else
tree-sitter-build:
radare2-shell-parser-build:
endif
www-sync-m sync-www-m: ../../radare2-webui/dist/m
cp -rf ../../radare2-webui/dist/m www/m.tmp

View File

@ -232,109 +232,79 @@ sdb_gen_cmd = [
'@INPUT@'
]
if get_option('use_treesitter')
# handle tree-sitter dependency
if get_option('tree_sitter_in_builddir')
tree_sitter_path = join_paths(meson.current_build_dir(), 'tree-sitter')
else
tree_sitter_path = join_paths(meson.current_source_dir(), 'tree-sitter')
endif
res = run_command(py3_exe, '-c', '__import__("sys").exit(__import__("os").path.exists("@0@"))'.format(tree_sitter_path))
if res.returncode() == 0
if not git_exe.found()
error('Cannot load tree-sitter library. Either provide tree-sitter in ./shlr/tree-sitter or install git, so it can be downloaded')
endif
# NOTE: when you update TS_TIP or TS_BRA, also update them in shlr/Makefile
TS_TIP = 'f049ba350f3f6019ce9a1cbb0975ebd154ef7ad3'
TS_BRA = 'master'
message('Cloning tree-sitter ' + TS_BRA + ' branch, commit ' + TS_TIP + ', into ' + tree_sitter_path)
git_cmd = 'clone -b @0@ https://github.com/tree-sitter/tree-sitter.git @1@'.format(TS_BRA, tree_sitter_path)
clone_cmd = run_command(git_exe, git_cmd.split())
if clone_cmd.returncode() != 0
error('Cannot execute git clone command')
endif
reset_cmd_str = '-C @0@ reset --hard @1@'.format(tree_sitter_path, TS_TIP)
reset_cmd = run_command(git_exe, reset_cmd_str.split())
if reset_cmd.returncode() != 0
error('Cannot execute git reset command')
endif
# handle tree-sitter
tree_sitter_path = join_paths(meson.current_source_dir(), 'tree-sitter')
tree_sitter_vc_path = join_paths(meson.current_source_dir(), 'tree-sitter.vc')
if get_option('tree-sitter-sync')
if not git_exe.found()
error('Cannot sync tree-sitter library. Either provide tree-sitter in ./shlr/tree-sitter or install git, so it can be downloaded')
endif
tree_sitter_files = [
join_paths(tree_sitter_path, 'lib/src/lib.c'),
]
# NOTE: when you update TS_TIP or TS_BRA, also update them in shlr/Makefile
TS_TIP = 'f049ba350f3f6019ce9a1cbb0975ebd154ef7ad3'
TS_BRA = 'master'
tree_sitter_inc = [platform_inc, include_directories('tree-sitter/lib/src'), include_directories('tree-sitter/lib/include')]
libtree_sitter = static_library('tree_sitter', tree_sitter_files,
include_directories: tree_sitter_inc,
implicit_include_directories: false,
c_args: ['-std=c99']
)
tree_sitter_dep = declare_dependency(
link_with: libtree_sitter,
include_directories: tree_sitter_inc
)
# handle radare2-shell-parser dependency
if get_option('shell_parser_in_builddir')
shell_parser_path = join_paths(meson.current_build_dir(), 'radare2-shell-parser')
else
shell_parser_path = join_paths(meson.current_source_dir(), 'radare2-shell-parser')
endif
res = run_command(py3_exe, '-c', '__import__("sys").exit(__import__("os").path.exists("@0@"))'.format(shell_parser_path))
if res.returncode() == 0
if not git_exe.found()
error('Cannot load radare2-shell-parser library. Either provide radare2-shell-parser in ./shlr/radare2-shell-parser or install git, so it can be downloaded')
endif
# NOTE: when you update SHELLPARSER_TIP or SHELLPARSER_BRA, also update them in shlr/Makefile
SHELLPARSER_TIP = '3d82cad9d865cb6e65364f66f038d1d1d4d8818a'
SHELLPARSER_BRA = 'master'
shell_parser_user = 'ret2libc'
message('Cloning radare2-shell-parser ' + SHELLPARSER_BRA + ' branch, commit ' + SHELLPARSER_TIP + ', into ' + shell_parser_path)
git_cmd = 'clone -b @0@ https://github.com/@1@/radare2-shell-parser.git @2@'.format(SHELLPARSER_BRA, shell_parser_user, shell_parser_path)
clone_cmd = run_command(git_exe, git_cmd.split())
if clone_cmd.returncode() != 0
error('Cannot execute git clone command')
endif
reset_cmd_str = '-C @0@ reset --hard @1@'.format(shell_parser_path, SHELLPARSER_TIP)
reset_cmd = run_command(git_exe, reset_cmd_str.split())
if reset_cmd.returncode() != 0
error('Cannot execute git reset command')
endif
message('Deleting existing directories @0@ and @1@'.format(tree_sitter_vc_path, tree_sitter_path))
res = run_command('rm', '-rf @0@ @1@'.format(tree_sitter_vc_path, tree_sitter_path).split())
message('Cloning tree-sitter ' + TS_BRA + ' branch, commit ' + TS_TIP + ', into ' + tree_sitter_vc_path)
git_cmd = 'clone -b @0@ https://github.com/tree-sitter/tree-sitter.git @1@'.format(TS_BRA, tree_sitter_vc_path)
clone_cmd = run_command(git_exe, git_cmd.split())
if clone_cmd.returncode() != 0
error('Cannot execute git clone command')
endif
shell_parser_files = [
join_paths(shell_parser_path, 'src/parser.c'),
join_paths(shell_parser_path, 'src/scanner.c'),
]
reset_cmd_str = '-C @0@ reset --hard @1@'.format(tree_sitter_vc_path, TS_TIP)
reset_cmd = run_command(git_exe, reset_cmd_str.split())
if reset_cmd.returncode() != 0
error('Cannot execute git reset command')
endif
shell_parser_inc = [platform_inc, include_directories('radare2-shell-parser/src/tree_sitter')]
libshell_parser = static_library('shell_parser', shell_parser_files,
include_directories: shell_parser_inc + tree_sitter_inc,
implicit_include_directories: true
)
shell_parser_dep = declare_dependency(
link_with: libshell_parser,
include_directories: shell_parser_inc,
dependencies: tree_sitter_dep
)
else
shell_parser_dep = []
tree_sitter_dep = []
message('Copying files from @0@ to @1@'.format(tree_sitter_vc_path, tree_sitter_path))
res = run_command('mkdir', '-p @0@/lib'.format(tree_sitter_path).split())
res = run_command('cp', '-r @0@/lib/src @1@/lib'.format(tree_sitter_vc_path, tree_sitter_path).split())
res = run_command('cp', '-r @0@/lib/include @1@/lib'.format(tree_sitter_vc_path, tree_sitter_path).split())
message('Deleting @0@'.format(tree_sitter_vc_path))
res = run_command('rm', '-rf @0@'.format(tree_sitter_vc_path).split())
endif
tree_sitter_files = [
join_paths(tree_sitter_path, 'lib/src/lib.c'),
]
tree_sitter_inc = [platform_inc, include_directories('tree-sitter/lib/src'), include_directories('tree-sitter/lib/include')]
libtree_sitter = static_library('tree_sitter', tree_sitter_files,
include_directories: tree_sitter_inc,
implicit_include_directories: false,
c_args: ['-std=c99']
)
tree_sitter_dep = declare_dependency(
link_with: libtree_sitter,
include_directories: tree_sitter_inc
)
# new radare2 shell parser
shell_parser_path = join_paths(meson.current_source_dir(), 'radare2-shell-parser')
shell_parser_files = [
join_paths(shell_parser_path, 'src/parser.c'),
join_paths(shell_parser_path, 'src/scanner.c'),
]
shell_parser_inc = [platform_inc, include_directories('radare2-shell-parser/src/tree_sitter')]
libshell_parser = static_library('shell_parser', shell_parser_files,
include_directories: shell_parser_inc + tree_sitter_inc,
implicit_include_directories: true
)
shell_parser_dep = declare_dependency(
link_with: libshell_parser,
include_directories: shell_parser_inc,
dependencies: tree_sitter_dep
)
# handle bochs dependency
bochs_files = [

1
shlr/radare2-shell-parser/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
node_modules

View File

@ -0,0 +1,26 @@
# radare2-shell-parser
This is the parser for the radare2 shell language.
See https://tree-sitter.github.io/tree-sitter/creating-parsers for more info on
how to create a parser with tree-sitter.
## Sources
- grammar.js: defines the basic grammar
- src/scanner.c: external scanner for the tokens that cannot be described in
grammar.js, since the JavaScript grammar only covers the context-free part of
the language.
- src/parser.c: this file is auto-generated by tree-sitter based on grammar.js
- corpus/\*: list of test files used to ensure the grammar works well
## How to update grammar
When you update something in grammar.js or src/scanner.c you have to re-generate
the parser. The process works as follows:
1. Make the needed changes to grammar.js and/or src/scanner.c
2. Install npm dependencies with: `cd shlr/radare2-shell-parser ; npm install`
3. Make sure tree-sitter is in PATH: `export PATH=$PATH:./node_modules/.bin`
4. Re-generate the parser files: `tree-sitter generate`
5. Check tests still pass: `tree-sitter test`. Use `tree-sitter parse
./example-file` if you want to see the syntax tree of a custom input provided
in `./example-file`
6. Commit auto-generated files and modified grammar.js and src/scanner.c into git.

View File

@ -0,0 +1,19 @@
# node-gyp build description for the "tree_sitter_r2cmd_binding" target.
{
"targets": [
{
"target_name": "tree_sitter_r2cmd_binding",
"include_dirs": [
# "<!(...)" is a gyp command expansion: the output of the node command is
# used as an include directory (here, the one provided by the 'nan' package).
"<!(node -e \"require('nan')\")",
"src"
],
# Generated parser, hand-written external scanner and the C++ binding glue.
"sources": [
"src/parser.c",
"src/scanner.c",
"src/binding.cc"
],
# C sources are built as C99 with debug symbols and optimizations disabled.
"cflags_c": [
"-std=c99 -ggdb -O0",
]
}
]
}

View File

@ -0,0 +1,67 @@
=======================================
Command substitution used as simple arg $(
=======================================
?e $(p8 10)
---
(commands
(arged_command command: (cmd_identifier)
args: (args
(arg (cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier)))))))))
=======================================
Command substitution with multiple commands
=======================================
?e $(p8 10; p8 4 @ 0xdeadbeef)
---
(commands
(arged_command command: (cmd_identifier)
args: (args
(arg (cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(tmp_seek_command
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(arg (arg_identifier))))))))
=======================================
Command substitution used as simple arg `
=======================================
?e `p8 10`
---
(commands
(arged_command command: (cmd_identifier)
args: (args
(arg (cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier)))))))))
=======================================
Nested command substitution
=======================================
?e $(p8 $(?e 10))
---
(commands
(arged_command command: (cmd_identifier)
args: (args (arg (cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args (arg (cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier)))))))))))))

View File

@ -0,0 +1,85 @@
=======================
One command and comment
=======================
afl # af is not going to be parsed
---
(commands
(arged_command (cmd_identifier)))
==================
Begin with comment
==================
# this is just a comment
# ~?
# boh
---
(commands)
====================================
Multiple commands multiple comments
====================================
afl # first comment
p8 10 # second comment
p8 10# third comment
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
===========================
Multiline comment on a line
===========================
p8 /* inline comment */ 3
afl
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)))
=====================================
Multiline comment on multiple lines
=====================================
p8 /* multiline
comment */ 3
afl
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)))
==================================
Comment with # in the middle line
==================================
p8 4 # something
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))

View File

@ -0,0 +1,40 @@
============================
Escape special command chars
============================
pd 10\@test\>name
---
(commands
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier)))))
====================
Use newlines in echo
====================
?e "Hello\nWorld"
---
(commands
(arged_command
command: (cmd_identifier)
args: (args (arg (double_quoted_arg)))))
===========
Escape hash
===========
?e Hello\#World
---
(commands
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier)))))

View File

@ -0,0 +1,221 @@
=================
Foreach addr+size
=================
pd @@@=0xdeadbeef 10
pd @@@=0xdeadbeef 10 $$ 20
---
(commands
(foreach_addrsize_command
(arged_command (cmd_identifier))
(arg (arg_identifier))
(arg (arg_identifier)))
(foreach_addrsize_command
(arged_command (cmd_identifier))
(arg (arg_identifier))
(arg (arg_identifier))
(arg (arg_identifier))
(arg (arg_identifier))))
===================
Foreach basic block
===================
pd @@@b
---
(commands
(foreach_bb_command
(arged_command (cmd_identifier))))
===============
Foreach command
===============
pd @@@c:?e hello
---
(commands
(foreach_cmd_command
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
===============
Foreach comment
===============
pd @@@C:comment
---
(commands
(foreach_comment_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
==============
Foreach import
==============
pd @@@i
---
(commands
(foreach_import_command
(arged_command (cmd_identifier))))
================
Foreach register
================
pd @@@r
---
(commands
(foreach_register_command
(arged_command (cmd_identifier))))
==============
Foreach symbol
==============
pd @@@s
---
(commands
(foreach_symbol_command
(arged_command (cmd_identifier))))
==============
Foreach string
==============
pd @@@st
---
(commands
(foreach_string_command
(arged_command (cmd_identifier))))
===============
Foreach section
===============
pd @@@S
---
(commands
(foreach_section_command
(arged_command (cmd_identifier))))
===============
Foreach io.maps
===============
pd @@@m
---
(commands
(foreach_iomap_command
(arged_command (cmd_identifier))))
===============
Foreach dbg.map
===============
pd @@@M
---
(commands
(foreach_dbgmap_command
(arged_command (cmd_identifier))))
============
Foreach flag
============
pd @@@f
---
(commands
(foreach_flag_command
(arged_command (cmd_identifier))))
==================
Foreach flag match
==================
pd @@@f:hit*
---
(commands
(foreach_flag_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
================
Foreach function
================
pd @@@F
---
(commands
(foreach_function_command
(arged_command (cmd_identifier))))
================
Foreach function match
================
pd @@@F:hit*
---
(commands
(foreach_function_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
==============
Foreach thread
==============
pd @@@t
---
(commands
(foreach_thread_command
(arged_command (cmd_identifier))))

View File

@ -0,0 +1,60 @@
================
Very simple grep
================
pd 10 ~ mov
pd 10~mov
afl~$
---
(commands
(grep_command
command: (arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
specifier: (grep_specifier (grep_specifier_identifier)))
(grep_command
command: (arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
specifier: (grep_specifier (grep_specifier_identifier)))
(grep_command
command: (arged_command command: (cmd_identifier))
specifier: (grep_specifier (grep_specifier_identifier))))
=======================================
Grep with cmd substitution as specifier
=======================================
pd 10~`?e mov`
pd 10~mo`?e v`
pd 10~mo$(?e v)
---
(commands
(grep_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(grep_specifier
(cmd_substitution_arg
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))))
(grep_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(grep_specifier
(grep_specifier_identifier)
(cmd_substitution_arg
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))))
(grep_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(grep_specifier
(grep_specifier_identifier)
(cmd_substitution_arg
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))))

View File

@ -0,0 +1,205 @@
============
Iter flags
============
p8 4 @@ sym.*
---
(commands
(iter_flags_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
========
Iter dbt
========
p8 4 @@dbt
p8 4 @@dbta
p8 4 @@dbtb
p8 4 @@dbts
---
(commands
(iter_dbta_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(iter_dbta_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(iter_dbtb_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(iter_dbts_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
===============
Iter file lines
===============
p8 @@.file
---
(commands
(iter_file_lines_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
===============
Iter offsets
===============
p8 @@=off1 off2
---
(commands
(iter_offsets_command
(arged_command (cmd_identifier))
(args (arg (arg_identifier)) (arg (arg_identifier)))))
===============
Iter sdbquery
===============
p8 @@k sdbquery
---
(commands
(iter_sdbquery_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
===============
Iter threads
===============
p8 @@t
---
(commands
(iter_threads_command
(arged_command (cmd_identifier))))
===============
Iter basic blocks
===============
p8 @@b
---
(commands
(iter_bbs_command
(arged_command (cmd_identifier))))
===============
Iter instructions
===============
p8 @@i
---
(commands
(iter_instrs_command
(arged_command (cmd_identifier))))
===============
Iter sections
===============
p8 @@iS
---
(commands
(iter_sections_command
(arged_command (cmd_identifier))))
===============
Iter functions
===============
p8 @@f
---
(commands
(iter_functions_command
(arged_command (cmd_identifier))))
===============
Iter function matching
===============
p8 @@f:write
---
(commands
(iter_functions_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
===============
Iter by step
===============
p8 @@s:0xd000 0xe000 0x100
---
(commands
(iter_step_command
(arged_command (cmd_identifier))
(arg (arg_identifier))
(arg (arg_identifier))
(arg (arg_identifier))))
===============
Iter command
===============
p8 @@c:/x 9090
---
(commands
(iter_interpret_command
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
================
Iter search hits
================
pd 2 @@/x 9090
---
(commands
(iter_hit_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))

View File

@ -0,0 +1,230 @@
==========================
Show data with format_name
==========================
pf fmt_name
pf* fmt_name
pfc fmt_name
pfj fmt_name
pfq fmt_name
pfs fmt_name
---
(commands
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args (pf_arg (pf_arg_identifier)))))
================================
Show data with format definition (simple)
================================
pf 3xi foo bar
pf* 3xi foo bar
pfc 3xi foo bar
pfj 3xi foo bar
pfq 3xi foo bar
pfs 3xi foo bar
---
(commands
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))))
===============
pf. format name
===============
pf.fmt_name
---
(commands
(arged_command (cmd_identifier)
(pf_dot_cmd_args (pf_args (pf_arg (pf_arg_identifier))))))
=================
pf. format fields
=================
pf.fmt_name.field_name
pf.fmt_name.field_name=33
pf.fmt_name.field_name[3]
---
(commands
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))))
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))
(pf_arg_identifier)
(pf_args (pf_arg (pf_arg_identifier)))))
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))))
=================
pfv format fields
=================
pfv.fmt_name
pfv.fmt_name.field_name
pfv.fmt_name.field_name=0xdeadbeef
---
(commands
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier)))))
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))))
(arged_command (cmd_identifier)
(pf_dot_cmd_args
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))
(pf_arg_identifier)
(pf_args (pf_arg (pf_arg_identifier))))))
======================
pf.fmt_name definition
======================
pf.obj xxdz prev next size name
---
(commands
(arged_command (cmd_identifier)
(pf_new_args
(pf_arg (pf_arg_identifier))
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))))))
=======
List pf
=======
pf.
---
(commands
(arged_command (cmd_identifier)))
=================
Pf load from file
=================
pfo /tmp/myfile
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
==========================
pf format with parentheses
==========================
pf B (BitFldType)arg_name
---
(commands
(arged_command (cmd_identifier)
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg
(pf_concatenation
(pf_arg_identifier)
(pf_args (pf_arg (pf_arg_identifier)))
(pf_arg_identifier)
(pf_arg_identifier))))))
==========
Cf example
==========
Cf 64 [2]zwww e_magic e_cblp e_cp e_crlc e_cparhdr
Cf-
---
(commands
(arged_command (cmd_identifier)
(args
(arg (arg_identifier))
(pf_args
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier))
(pf_arg (pf_arg_identifier)))))
(arged_command (cmd_identifier)))

View File

@ -0,0 +1,94 @@
======================
Disable html and color
======================
p8 10 |
---
(commands
(html_disable_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
======================
Enable html and color
======================
p8 10 |H
---
(commands
(html_enable_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
======================
Use scr.tts
======================
p8 10 |T
---
(commands
(scr_tts_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
======================
Pipe to another command
======================
p8 10 | grep 10
---
(commands
(pipe_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(pipe_second_command)))
===========
Double pipe
===========
pd 10 | cat | grep mov
---
(commands
(pipe_command
(pipe_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(pipe_second_command))
(pipe_second_command)))
===================================
Pipe to another command with no arg
===================================
f | grep main
---
(commands
(pipe_command
(arged_command (cmd_identifier))
(pipe_second_command)))
=======================
Pipe to interpreter "."
=======================
p8 10 |.
---
(commands
(arged_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))

View File

@ -0,0 +1,53 @@
==============================
Echo with (double) quoted args
==============================
?e "This;is.one@string"
---
(commands
(arged_command command: (cmd_identifier)
args: (args (arg (double_quoted_arg)))))
==============================
Echo with (single) quoted args
==============================
?e 'This;is.one@string'
---
(commands
(arged_command command: (cmd_identifier)
args: (args (arg (single_quoted_arg)))))
===========================
Legacy command - all quoted
===========================
"?e This;is.one@string"
---
(commands
(legacy_quoted_command))
=======================================
Double quoted arg with cmd substitution
=======================================
?e "This is $(?e "a command")"
---
(commands
(arged_command command: (cmd_identifier)
args: (args
(arg (double_quoted_arg
(cmd_substitution_arg
(arged_command command: (cmd_identifier)
args: (args
(arg (double_quoted_arg))))))))))

View File

@ -0,0 +1,73 @@
===============
Redirect stdout
===============
afl > /tmp/test.txt
---
(commands
(redirect_command
(arged_command (cmd_identifier))
(fdn_redirect_operator)
(arg (arg_identifier))))
===============
Redirect stderr
===============
afl 2> /tmp/test.txt
---
(commands
(redirect_command
(arged_command (cmd_identifier))
(fdn_redirect_operator (file_descriptor))
(arg (arg_identifier))))
=============
Redirect HTML
=============
afl H> /tmp/test.txt
---
(commands
(redirect_command
(arged_command (cmd_identifier))
(html_redirect_operator)
(arg (arg_identifier))))
===============
Append to file
===============
afl >> /tmp/test.txt
---
(commands
(redirect_command
(arged_command (cmd_identifier))
(fdn_append_operator)
(arg (arg_identifier))))
===============
Append err to file
===============
afl 2>> /tmp/test.txt
---
(commands
(redirect_command
(arged_command (cmd_identifier))
(fdn_append_operator (file_descriptor))
(arg (arg_identifier))))

View File

@ -0,0 +1,63 @@
=========================
One digit repeat commands
=========================
7/x 90
---
(commands
(repeat_command
(number)
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
=========================
Multiple digits repeat commands
=========================
17/x 90
---
(commands
(repeat_command
(number)
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))
=======================
Repeat with redirection
=======================
2p8 8 > /tmp/out.txt
---
(commands
(redirect_command
(repeat_command
(number)
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(fdn_redirect_operator)
(arg (arg_identifier))))
====================
Repeat with tmp seek
====================
2p8 8 @ 0xdeadbeef
---
(commands
(tmp_seek_command
(repeat_command
(number)
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(arg (arg_identifier))))

View File

@ -0,0 +1,53 @@
=============
Simple search
=============
/ "foo\x00"
---
(commands
(arged_command (cmd_identifier)
(args (arg (double_quoted_arg)))))
==============
Not matching
==============
/! ff
/!x 00
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
=======
Reg exp
=======
/e /E.F/i
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
===================
Search with nibbles
===================
/x ff..33
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))

View File

@ -0,0 +1,245 @@
================
No arguments
================
afl
afl
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)))
=============
One argument
=============
af 0xdeadbeef
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
==============================
Semi-colon separated commands
==============================
afl;af 0xdeadbeef ; afl
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)))
=========================
Newline separate commands
=========================
afl
af 0xdeadbeef
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
======================
Question mark commands
======================
? x
??
???
?t cmd
?x hello
?:
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)))
===============
Search commands
===============
/x 90
/v4 0xdead
---
(commands
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
========================
Different output formats
========================
afl*
afl+
aflj
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)))
=============
Help commands
=============
?
a?
$?
*?
afl?
(?
=?
/v?
/v4?
|?
.?
./?
@?
@@?
@@@?
~?
?*
p?*
&?
pf??
pf???
---
(commands
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier))
(help_command (cmd_identifier)))
===============================
Command with concatenation args
===============================
?e Hello" World"'!' And All
---
(commands
(arged_command (cmd_identifier)
(args
(arg
(concatenation
(arg_identifier)
(double_quoted_arg)
(single_quoted_arg)))
(arg (arg_identifier))
(arg (arg_identifier)))))
==============
Number command
==============
0x8048000
0b1001
0xtest
0123123
0x123command
---
(commands
(number_command)
(number_command)
(number_command) (ERROR)
(number_command)
(number_command) (ERROR))
=============
Tasks command
=============
& ?e Hello
& ?e Hello
&& 1
&=
---
(commands
(task_command (cmd_identifier)
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(task_command (cmd_identifier)
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))
(task_command (cmd_identifier)
(args (arg (arg_identifier))))
(task_command (cmd_identifier)))
==============
Arg with (...)
==============
?e Hello(World)
---
(commands
(arged_command (cmd_identifier)
(args
(arg
(concatenation
(arg_identifier)
(arg_identifier)
(args (arg (arg_identifier)))
(arg_identifier))))))

View File

@ -0,0 +1,256 @@
======================
Pointer type commands
======================
*entry0
*entry0=cc
*entry0+10=cc
---
(commands
(arged_command (cmd_identifier)
(args
(args (arg (arg_identifier)))))
(arged_command (cmd_identifier)
(args
(args (arg (arg_identifier)))
(arg_identifier)
(args (arg (arg_identifier)))))
(arged_command (cmd_identifier)
(args
(args (arg (arg_identifier)))
(arg_identifier)
(args (arg (arg_identifier))))))
==============================
Environment variable command
==============================
%
%SHELL
%TMPDIR=/tmp
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args
(args (arg (arg_identifier)))))
(arged_command (cmd_identifier)
(args
(args (arg (arg_identifier)))
(arg_identifier)
(args (arg (arg_identifier))))))
===============
Macro commands
===============
(_foo x y; p8 $0 @ $1)
(-foo)
(foo x y; p8 $0 @ $1)(10 0x10)
(pdstr bits; s $0; .(pdenc guess); .(pdenc utf$0le); .(pdenc utf$0be))
(foo; pd 10~this is special\))
(
(*
---
(commands
(arged_command (cmd_identifier)
(macro_args
(macro_content
(arg (arg_identifier))
(args
(arg (arg_identifier))
(arg (arg_identifier)))
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))))
(arged_command (cmd_identifier)
(macro_args
(macro_content
(arg (arg_identifier)))))
(arged_command (cmd_identifier)
(macro_args
(macro_content
(arg (arg_identifier))
(args
(arg (arg_identifier))
(arg (arg_identifier)))
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
(macro_call_full_content
(macro_call_content
(args
(arg (arg_identifier))
(arg (arg_identifier)))))))
(arged_command (cmd_identifier)
(macro_args
(macro_content
(arg (arg_identifier))
(args (arg (arg_identifier)))
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arged_command (cmd_identifier)
(macro_call_content
(args
(arg (arg_identifier))
(arg (arg_identifier)))))
(arged_command (cmd_identifier)
(macro_call_content
(args
(arg (arg_identifier))
(arg (arg_identifier)))))
(arged_command (cmd_identifier)
(macro_call_content
(args
(arg (arg_identifier))
(arg (arg_identifier))))))))
(arged_command (cmd_identifier)
(macro_args
(macro_content
(arg (arg_identifier))
(grep_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(grep_specifier (grep_specifier_identifier))))))
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)))
===============
System commands
===============
!
!ls
!!
!!ls
!=!
---
(commands
(arged_command (system_identifier))
(arged_command (system_identifier)
(args (arg (arg_identifier))))
(arged_command (system_identifier))
(arged_command (system_identifier)
(args (arg (arg_identifier))))
(arged_command (system_identifier)))
===================
Interpret r2 commands
===================
.cmd a1
.. myfile
. myfile.r2
.* file
.!rabin2 -ri $FILE
.(foo 1 2 3)
./ ELF
pd 10 |.
---
(commands
(arged_command
command: (cmd_identifier)
args: (arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier)))))
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(arged_command
command: (cmd_identifier)
args: (interpret_arg))
(arged_command
command: (cmd_identifier)
args: (macro_call_content
(args
(arg (arg_identifier))
(arg (arg_identifier))
(arg (arg_identifier))
(arg (arg_identifier)))))
(arged_command
command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(arged_command
args: (arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier)))))
)
============
Last cmd
============
.
...
---
(commands
(last_command (cmd_identifier))
(last_command (cmd_identifier)))
===================
Interpreter commands
===================
#!
#!python arg0
#!rust
#!?
---
(commands
(arged_command (cmd_identifier))
(arged_command (cmd_identifier)
(args
(arg (arg_identifier))
(arg (arg_identifier))))
(arged_command (cmd_identifier)
(args
(arg (arg_identifier))))
(help_command (cmd_identifier)))
=======================================
Pointer type commands with substitution
=======================================
*entr$(?e y0)=$(?v $$)
---
(commands
(arged_command (cmd_identifier)
(args
(args
(arg (arg_identifier))
(arg (cmd_substitution_arg
(arged_command (cmd_identifier)
(args (arg (arg_identifier)))))))
(arg_identifier)
(args
(arg (cmd_substitution_arg
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))))))))

View File

@ -0,0 +1,263 @@
======================
Temporary changes help
======================
@?
---
(commands
(help_command (cmd_identifier)))
==============
Temporary seek
==============
p8 10 @ 0xdeadbeef
p8 10 @ flag
---
(commands
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier)))
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
===================
Temporary blocksize
===================
p8 10 @! 30
---
(commands
(tmp_blksz_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
=================
Temporary from/to
=================
/x 9090 @{0xbeef 0xdead}
---
(commands
(tmp_fromto_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))
(arg (arg_identifier))))
=================
Temporary arch
=================
pd 2 @a:x86
---
(commands
(tmp_arch_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
=================
Temporary bits
=================
pd 2 @b:16
---
(commands
(tmp_bits_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
=================
Temporary seek to nth instr
=================
pd 2 @B:3
---
(commands
(tmp_nthi_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
=================
Temporary eval
=================
pd 2 @e:scr.utf8=false
pd 2 @e:asm.arch=x86,scr.utf8=true
---
(commands
(tmp_eval_command
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(tmp_eval_args (tmp_eval_arg)))
(tmp_eval_command
(arged_command command: (cmd_identifier)
args: (args (arg (arg_identifier))))
(tmp_eval_args
(tmp_eval_arg)
(tmp_eval_arg))))
=================
Temporary flagspace
=================
f @F:symbols
---
(commands
(tmp_fs_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary relative instruction
=================
pd 2 @i:4
---
(commands
(tmp_reli_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))
=================
Temporary seek to sdb key
=================
pd @k:key
---
(commands
(tmp_kuery_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary switch fd
=================
pd @o:3
---
(commands
(tmp_fd_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary seek to reg value
=================
pd @r:rax
---
(commands
(tmp_reg_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary file content
=================
pd @f:myfile
---
(commands
(tmp_file_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary string content
=================
pd @s:mystring
---
(commands
(tmp_string_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================
Temporary hex content
=================
pd @x:90deadbeef
---
(commands
(tmp_hex_command
(arged_command (cmd_identifier))
(arg (arg_identifier))))
=================================
2 Temporary changes + Redirection
=================================
p8 4 @ 0xdead @a:x86 > /tmp/out.txt
---
(commands
(redirect_command
(tmp_arch_command
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier)))
(arg (arg_identifier)))
(fdn_redirect_operator)
(arg (arg_identifier))))
=====================
Spaces after tmp_seek
=====================
wx 0x68a0a@entry0
---
(commands
(tmp_seek_command
(arged_command (cmd_identifier)
(args (arg (arg_identifier))))
(arg (arg_identifier))))

View File

@ -0,0 +1,699 @@
// Character sets that are excluded from the plain-text part of an argument.
// Each entry is a string fragment handed to noneOf(); '\\s' is the two
// characters `\s` and '\\' is a single backslash.
// NOTE(review): presumably noneOf() joins these into a negated regex character
// class -- confirm against its definition, which is outside this excerpt.
const SPECIAL_CHARACTERS = [
    '\\s',
    '@', '|', '#', '"', '\'',
    '>', ';', '$', '`', '~',
    '\\', ',', '(', ')',
];

// Same set, in the same order, without ','.
const PF_SPECIAL_CHARACTERS = SPECIAL_CHARACTERS.filter(ch => ch !== ',');

// pf set extended with '.' and '='.
const PF_DOT_SPECIAL_CHARACTERS = [...PF_SPECIAL_CHARACTERS, '.', '='];

// Variants of the generic set with extra excluded characters.
const SPECIAL_CHARACTERS_EQUAL = [...SPECIAL_CHARACTERS, '='];
// NOTE(review): ',' is already part of SPECIAL_CHARACTERS, so this appends a
// duplicate entry; kept as-is to preserve the original array contents.
const SPECIAL_CHARACTERS_COMMA = [...SPECIAL_CHARACTERS, ','];
const SPECIAL_CHARACTERS_BRACE = [...SPECIAL_CHARACTERS, '{', '}'];
/**
 * Build the set of alternatives that form one piece of an unquoted argument.
 *
 * The four argument flavours below used to be four hand-copied `choice(...)`
 * lists that differed only in which characters end a plain run; they are now
 * produced by this single factory so the alternatives cannot drift apart.
 * Every call builds fresh rule objects, exactly as the inlined copies did.
 *
 * @param {string[]} specialCharacters characters passed to noneOf(), i.e. the
 *     ones that may NOT appear in a plain run of argument text.
 * @returns the `choice(...)` rule combining the alternatives.
 */
function buildArgIdentifier(specialCharacters) {
    return choice(
        // One or more characters outside the given special set.
        repeat1(noneOf(...specialCharacters)),
        // Literal runs of one to three '$'.
        '$$$',
        '$$',
        '$',
        // '$' followed by one character that is not in the listed set.
        /\$[^@|#"'>;`~\\({) ]/,
        // '${...}' with a non-empty body free of CR, LF, space, '$' and '}'.
        /\${[^\r\n $}]+}/,
        // Backslash followed by any one character (escape sequence).
        /\\./,
        // '/' followed by anything but '*'.
        // NOTE(review): presumably so that "/*" is left for the comment rule -- confirm.
        /\/[^\*]/,
    );
}

// Generic argument text.
const ARG_IDENTIFIER_BASE = buildArgIdentifier(SPECIAL_CHARACTERS);
// Argument text where '{' and '}' also end a plain run.
const ARG_IDENTIFIER_BRACE = buildArgIdentifier(SPECIAL_CHARACTERS_BRACE);
// pf argument text where '.' and '=' also end a plain run.
const PF_DOT_ARG_IDENTIFIER_BASE = buildArgIdentifier(PF_DOT_SPECIAL_CHARACTERS);
// pf argument text (',' is allowed inside a plain run).
const PF_ARG_IDENTIFIER_BASE = buildArgIdentifier(PF_SPECIAL_CHARACTERS);
module.exports = grammar({
name: 'r2cmd',
// Tokens that may appear anywhere between other tokens: comments and runs of
// spaces/tabs. Newlines are deliberately not listed here (the regex only
// matches ' ' and '\t').
extras: $ => [
$._comment,
/[ \t]*/,
],
// Tokens declared as externals, i.e. recognised by the hand-written external
// scanner rather than by rules in this file.
// NOTE(review): the order of this list normally has to match the scanner's
// token enum -- confirm against the scanner source before reordering.
externals: $ => [
$.cmd_identifier,
$._help_command,
$.file_descriptor,
$._eq_sep_concat,
$._concat,
$._concat_brace,
$._concat_pf_dot,
],
// Rules that are inlined at every place they are used, so they never show up
// as nodes of their own in the syntax tree.
inline: $ => [
$.cmd_delimiter,
$.cmd_delimiter_singleline,
$._comment,
],
rules: {
// Root rule: a whole script / input line.
// Three shapes are accepted:
//   1. nothing at all,
//   2. only command delimiters,
//   3. optional leading delimiters, one command, then any number of
//      "delimiter + optional command" pairs (so empty commands between
//      delimiters and a trailing delimiter are allowed).
commands: $ => choice(
seq(),
seq(repeat($.cmd_delimiter)),
seq(
repeat($.cmd_delimiter),
$._command,
repeat(seq($.cmd_delimiter, optional($._command)))
),
),
// Same layout as the third shape of `commands`, but built on
// cmd_delimiter_singleline and requiring at least one command.
// Given precedence 1 over competing rules.
_commands_singleline: $ => prec(1,seq(
repeat($.cmd_delimiter_singleline),
$._command,
repeat(seq($.cmd_delimiter_singleline, optional($._command)))
)),
// A single command: either a command with an output redirection or any
// simple command. redirect_command is only reachable from here, not from
// _simple_command.
_command: $ => choice(
$.redirect_command,
$._simple_command,
),
// A whole command wrapped in double quotes, e.g. "?e This;is.one@string".
// The content is captured as one opaque token (field 'string') and is not
// parsed further.
legacy_quoted_command: $ => seq(
'"',
// One or more of: any char except '"' and '\', or a backslash followed by
// any char (including a newline). Token precedence -1 lowers its priority
// against other tokens.
field('string', token(prec(-1, /([^"\\]|\\(.|\n))+/))),
'"',
),
// Every command form that can be used where a plain command is expected
// (everything except redirect_command). Several of the alternatives are
// themselves built around a nested _simple_command, which is how modifiers
// such as `@`, `@@`, `@@@`, `|` and `~` stack on top of each other.
_simple_command: $ => choice(
$.help_command,
$.repeat_command,
$.arged_command,
$.number_command,
$.task_command,
$._tmp_command,
$._iter_command,
$._foreach_command,
$._pipe_command,
$.grep_command,
$.last_command,
$.legacy_quoted_command,
$._pf_commands,
),
// Group of the "temporary change" command forms. In the test corpus these are
// the `@`-style suffixes, e.g. `p8 10 @ 0xdeadbeef` (tmp_seek_command) or
// `pd 2 @a:x86` (tmp_arch_command). The individual rules are defined
// elsewhere in this grammar.
_tmp_command: $ => choice(
$.tmp_seek_command,
$.tmp_blksz_command,
$.tmp_fromto_command,
$.tmp_arch_command,
$.tmp_bits_command,
$.tmp_nthi_command,
$.tmp_eval_command,
$.tmp_fs_command,
$.tmp_reli_command,
$.tmp_kuery_command,
$.tmp_fd_command,
$.tmp_reg_command,
$.tmp_file_command,
$.tmp_string_command,
$.tmp_hex_command,
),
// Group of the iterator command forms. In the test corpus these are the
// `@@`-style suffixes, e.g. `p8 4 @@ sym.*` (iter_flags_command) or
// `p8 @@f:write` (iter_functions_command). The individual rules are defined
// elsewhere in this grammar.
_iter_command: $ => choice(
$.iter_flags_command,
$.iter_dbta_command,
$.iter_dbtb_command,
$.iter_dbts_command,
$.iter_file_lines_command,
$.iter_offsets_command,
$.iter_sdbquery_command,
$.iter_threads_command,
$.iter_bbs_command,
$.iter_instrs_command,
$.iter_sections_command,
$.iter_functions_command,
$.iter_step_command,
$.iter_interpret_command,
$.iter_hit_command,
),
// Group of the `@@@` foreach command forms; each alternative is a command
// followed by one specific `@@@<selector>` suffix.
_foreach_command: $ => choice(
$.foreach_addrsize_command,
$.foreach_bb_command,
$.foreach_cmd_command,
$.foreach_comment_command,
$.foreach_import_command,
$.foreach_register_command,
$.foreach_symbol_command,
$.foreach_string_command,
$.foreach_section_command,
$.foreach_iomap_command,
$.foreach_dbgmap_command,
$.foreach_flag_command,
$.foreach_function_command,
$.foreach_thread_command,
),
// Group of the command forms introduced by a trailing `|`:
// bare `|`, `|H`, `|T`, and `| <other command>`.
_pipe_command: $ => choice(
$.html_disable_command,
$.html_enable_command,
$.pipe_command,
$.scr_tts_command,
),
// `<command>~<specifier>`: a command whose output is filtered by a grep
// specifier, e.g. `pd 10~mov`. Exposes the fields 'command' and 'specifier'.
grep_command: $ => seq(
field('command', $._simple_command),
'~',
field('specifier', $.grep_specifier),
),
// FIXME: improve parser for grep specifier
// grep_specifier_identifier also includes ~ because r2 does not support nested grep commands yet
//
// Literal text of a grep specifier, lexed as a single token. It is one or
// more of the following pieces:
//   - a run of characters other than newline, CR, ';', '#', '@', '>', '|',
//     '`', '$', '(' and ')';
//   - a backslash followed by any one character;
//   - a '$' followed by one character that is not '(', CR, newline, ';',
//     '#', '>', '|' or '`' (so `$(` is left for command substitution).
grep_specifier_identifier: $ => token(seq(repeat1(
choice(
/[^\n\r;#@>|`$()]+/,
/\\./,
/\$[^(\r\n;#>|`]/,
)
))),
grep_specifier: $ => prec.left(choice(
seq(
repeat1(
choice(
$.grep_specifier_identifier,
$.cmd_substitution_arg,
),
),
optional(alias(/[$]+/, $.grep_specifier_identifier)),
),
alias(/[$]+/, $.grep_specifier_identifier),
)),
html_disable_command: $ => prec.right(1, seq(
field('command', $._simple_command),
'|'
)),
html_enable_command: $ => prec.right(1, seq(
field('command', $._simple_command),
'|H'
)),
scr_tts_command: $ => prec.right(1, seq(
field('command', $._simple_command),
'|T'
)),
pipe_command: $ => seq($._simple_command, '|', $.pipe_second_command),
pipe_second_command: $ => /[^|\r\n;]+/,
foreach_addrsize_command: $ => prec.right(1, seq($._simple_command, '@@@=', repeat1(seq($.arg, $.arg)))),
foreach_bb_command: $ => prec.right(1, seq($._simple_command, '@@@b')),
foreach_cmd_command: $ => prec.right(1, seq($._simple_command, '@@@c:', $._simple_command)),
foreach_comment_command: $ => prec.right(1, seq($._simple_command, '@@@C:', $.arg)),
foreach_import_command: $ => prec.right(1, seq($._simple_command, '@@@i')),
foreach_register_command: $ => prec.right(1, seq($._simple_command, '@@@r')),
foreach_symbol_command: $ => prec.right(1, seq($._simple_command, '@@@s')),
foreach_string_command: $ => prec.right(1, seq($._simple_command, '@@@st')),
foreach_section_command: $ => prec.right(1, seq($._simple_command, '@@@S')),
foreach_iomap_command: $ => prec.right(1, seq($._simple_command, '@@@m')),
foreach_dbgmap_command: $ => prec.right(1, seq($._simple_command, '@@@M')),
foreach_flag_command: $ => prec.right(1,
choice(
seq($._simple_command, '@@@f'),
seq($._simple_command, '@@@f:', $.arg),
),
),
foreach_function_command: $ => prec.right(1,
choice(
seq($._simple_command, '@@@F'),
seq($._simple_command, '@@@F:', $.arg)
)
),
foreach_thread_command: $ => prec.right(1, seq($._simple_command, '@@@t')),
iter_flags_command: $ => prec.right(1, seq($._simple_command, '@@', $.arg)),
iter_dbta_command: $ => prec.right(1, seq($._simple_command, choice('@@dbt', '@@dbta'))),
iter_dbtb_command: $ => prec.right(1, seq($._simple_command, '@@dbtb')),
iter_dbts_command: $ => prec.right(1, seq($._simple_command, '@@dbts')),
iter_file_lines_command: $ => prec.right(1, seq($._simple_command, '@@.', $.arg)),
iter_offsets_command: $ => prec.right(1, seq($._simple_command, '@@=', optional($.args))),
iter_sdbquery_command: $ => prec.right(1, seq($._simple_command, '@@k', $.arg)),
iter_threads_command: $ => prec.right(1, seq($._simple_command, '@@t')),
iter_bbs_command: $ => prec.right(1, seq($._simple_command, '@@b')),
iter_instrs_command: $ => prec.right(1, seq($._simple_command, '@@i')),
iter_sections_command: $ => prec.right(1, seq($._simple_command, '@@iS')),
iter_functions_command: $ => prec.right(1, seq($._simple_command, '@@f', optional(seq(':', $.arg)))),
iter_step_command: $ => prec.right(1, seq($._simple_command, '@@s:', $.arg, $.arg, $.arg)),
iter_interpret_command: $ => prec.right(1, seq($._simple_command, '@@c:', $._simple_command)),
iter_hit_command: $ => prec.right(1, seq(
$._simple_command,
'@@',
$._concat,
alias($._search_command, $.arged_command)
)),
// tmp changes commands
tmp_seek_command: $ => prec.right(1, seq($._simple_command, '@', $.arg)),
tmp_blksz_command: $ => prec.right(1, seq($._simple_command, '@!', $.arg)),
// NOTE: need to use special arg_brace here because of https://github.com/radareorg/radare2/commit/c3dee9332c19f874ac2cc9294a9ffe17575d8141
tmp_fromto_command: $ => prec.right(1, seq(
$._simple_command,
'@{',
alias($.arg_brace, $.arg),
alias($.arg_brace, $.arg),
'}'
)),
tmp_arch_command: $ => prec.right(1, seq($._simple_command, '@a:', $.arg)),
tmp_bits_command: $ => prec.right(1, seq($._simple_command, '@b:', $.arg)),
tmp_nthi_command: $ => prec.right(1, seq($._simple_command, '@B:', $.arg)),
tmp_eval_command: $ => prec.right(1, seq($._simple_command, '@e:', $.tmp_eval_args)),
tmp_fs_command: $ => prec.right(1, seq($._simple_command, '@F:', $.arg)),
tmp_reli_command: $ => prec.right(1, seq($._simple_command, '@i:', $.arg)),
tmp_kuery_command: $ => prec.right(1, seq($._simple_command, '@k:', $.arg)),
tmp_fd_command: $ => prec.right(1, seq($._simple_command, '@o:', $.arg)),
tmp_reg_command: $ => prec.right(1, seq($._simple_command, '@r:', $.arg)),
tmp_file_command: $ => prec.right(1, seq($._simple_command, '@f:', $.arg)),
tmp_string_command: $ => prec.right(1, seq($._simple_command, '@s:', $.arg)),
tmp_hex_command: $ => prec.right(1, seq($._simple_command, '@x:', $.arg)),
_interpreter_command: $ => prec.right(1, seq(
field('command', alias('#!', $.cmd_identifier)),
field('args', optional($.args)),
)),
// basic commands
task_command: $ => prec.left(1, choice(
seq(
field('command', alias(choice('&', '&t'), $.cmd_identifier)),
field('args', $._simple_command),
),
seq(
field('command', alias(/&[A-Za-z=\-+*&0-9]*/, $.cmd_identifier)),
field('args', optional($.args)),
),
)),
number_command: $ => choice(
$._dec_number,
'0',
/(0x[0-9A-Fa-f]+|0b[0-1]+)/,
),
help_command: $ => prec.left(1, choice(
field('command', alias($.question_mark_identifier, $.cmd_identifier)),
seq(
field('command', alias(choice($._help_command, '#?', '#!?'), $.cmd_identifier)),
field('args', optional($.args)),
),
)),
arged_command: $ => choice(
$._simple_arged_command,
$._math_arged_command,
$._pointer_arged_command,
$._macro_arged_command,
$._system_command,
$._interpret_command,
$._env_command,
$._interpreter_command,
$._pf_arged_command,
),
_simple_arged_command: $ => prec.left(1, seq(
field('command', $.cmd_identifier),
field('args', optional($.args)),
)),
_search_command: $ => prec.left(1, seq(
field('command', alias(/\/[A-Za-z0-9+!\/*]*/, $.cmd_identifier)),
field('args', optional($.args)),
)),
_math_arged_command: $ => prec.left(1, seq(
field('command', alias($.question_mark_identifier, $.cmd_identifier)),
field('args', $.args),
)),
_pointer_arged_command: $ => prec.left(1, seq(
field('command', alias($.pointer_identifier, $.cmd_identifier)),
field('args', alias($.eq_sep_args, $.args)),
)),
_macro_arged_command: $ => prec.left(1, seq(
field('command', alias($.macro_identifier, $.cmd_identifier)),
field('args', optional($.macro_args)),
)),
_system_command: $ => prec.left(1, seq(
field('command', $.system_identifier),
optional(field('args', $.args)),
)),
_interpret_command: $ => prec.left(1, choice(
seq(
field('command', alias('.', $.cmd_identifier)),
field('args', $._simple_command),
),
seq(
field('command', alias($._interpret_identifier, $.cmd_identifier)),
field('args', optional($.args)),
),
seq(
field('command', alias('.!', $.cmd_identifier)),
field('args', $.interpret_arg),
),
seq(
field('command', alias('.(', $.cmd_identifier)),
field('args', $.macro_call_content),
),
seq(
field('command', alias($._interpret_search_identifier, $.cmd_identifier)),
field('args', $.args),
),
prec.right(1, seq(
field('args', $._simple_command),
field('command', '|.'),
)),
)),
_interpret_search_identifier: $ => seq('./'),
_pf_arged_command: $ => choice(
seq(
field('command', alias($.pf_dot_cmd_identifier, $.cmd_identifier)),
),
seq(
field('command', alias('pfo', $.cmd_identifier)),
field('args', $.args),
),
),
_pf_commands: $ => prec.left(1, choice(
// pf fmt, pf* fmt_name|fmt, pfc fmt_name|fmt, pfd.fmt_name, pfj fmt_name|fmt, pfq fmt, pfs.struct_name, pfs format
alias($.pf_cmd, $.arged_command),
// pf.fmt_name.field_name, pf.fmt_name.field_name[i], pf.fmt_name.field_name=33, pfv.fmt_name[.field]
alias($.pf_dot_cmd, $.arged_command),
// pf.name [0|cnt]fmt
alias($.pf_new_cmd, $.arged_command),
// Cf [sz] [fmt]
alias($.Cf_cmd, $.arged_command),
// pf., pfo fdf_name: will be handled as regular arged_command
)),
Cf_cmd: $ => prec.left(seq(
field('command', alias('Cf', $.cmd_identifier)),
optional(field('args', alias($._Cf_args, $.args))),
)),
_Cf_args: $ => seq(
$.arg,
$.pf_args,
),
pf_dot_cmd_identifier: $ => 'pf.',
pf_dot_full_cmd_identifier: $ => /pf[*cjqsv]\./,
pf_new_cmd: $ => seq(
field('command', alias($.pf_dot_cmd_identifier, $.cmd_identifier)),
$._concat_pf_dot,
field('args', $.pf_new_args),
),
pf_dot_cmd: $ => prec.left(1, seq(
field('command', alias(choice($.pf_dot_cmd_identifier, $.pf_dot_full_cmd_identifier), $.cmd_identifier)),
$._concat_pf_dot,
field('args', $.pf_dot_cmd_args),
)),
pf_cmd: $ => seq(
field('command', alias(/pf[*cjqs]?/, $.cmd_identifier)),
field('args', $.pf_args),
),
pf_new_args: $ => seq(
alias($.pf_dot_arg, $.pf_arg),
$.pf_args,
),
pf_dot_cmd_args: $ => seq(
alias($.pf_dot_args, $.pf_args),
optional(seq(
alias('=', $.pf_arg_identifier),
$.pf_args,
)),
),
_pf_dot_arg_identifier: $ => token(seq(
repeat1(PF_DOT_ARG_IDENTIFIER_BASE),
)),
_pf_arg_parentheses: $ => seq(
alias('(', $.pf_arg_identifier),
$.pf_args,
alias(')', $.pf_arg_identifier),
),
pf_arg_identifier: $ => token(seq(
repeat1(PF_ARG_IDENTIFIER_BASE),
)),
_pf_arg: $ => choice(
$.pf_arg_identifier,
$._pf_arg_parentheses,
$.cmd_substitution_arg,
),
_pf_dot_arg: $ => choice(
alias($._pf_dot_arg_identifier, $.pf_arg_identifier),
$.cmd_substitution_arg,
),
pf_concatenation: $ => prec(-1, seq(
$._pf_arg,
repeat1(prec(-1, seq(
$._concat,
$._pf_arg,
))),
)),
pf_dot_concatenation: $ => prec(-1, seq(
$._pf_dot_arg,
repeat1(prec(-1, seq(
$._concat_pf_dot,
$._pf_dot_arg,
))),
)),
pf_arg: $ => choice(
$._pf_arg,
$.pf_concatenation
),
pf_dot_arg: $ => choice(
$._pf_dot_arg,
alias($.pf_dot_concatenation, $.pf_concatenation),
),
pf_args: $ => prec.left(repeat1($.pf_arg)),
pf_dot_args: $ => prec.left(1, seq(
alias($.pf_dot_arg, $.pf_arg),
repeat(seq(
$._concat_pf_dot,
'.',
$._concat_pf_dot,
alias($.pf_dot_arg, $.pf_arg),
)),
)),
_env_command: $ => prec.left(seq(
field('command', alias($._env_command_identifier, $.cmd_identifier)),
field('args', optional(alias($.eq_sep_args, $.args))),
)),
_env_command_identifier: $ => choice('%', 'env'),
last_command: $ => seq(
field('command', alias($.last_command_identifier, $.cmd_identifier)),
),
last_command_identifier: $ => choice('.', '...'),
_interpret_identifier: $ => prec(1, choice(
/\.[\.:\-*]+[ ]*/,
/\.[ ]+/,
)),
interpret_arg: $ => $._any_command,
// Token used as the command of `_system_command`: a `!` followed by any run of
// characters from the class below.
// NOTE(review): inside the class, `!-=` is a range (U+0021..U+003D), so besides
// `*`, `!`, `-` and `=` it also accepts digits and punctuation such as `;`, `(`
// and `"` — confirm this is intended; if not, `-` should be moved to the end of
// the class.
system_identifier: $ => /![\*!-=]*/,
question_mark_identifier: $ => '?',
repeat_command: $ => prec.left(1, seq(
field('arg', alias($._dec_number, $.number)),
field('command', $._simple_command),
)),
pointer_identifier: $ => '*',
eq_sep_args: $ => seq(
alias($.eq_sep_key, $.args),
optional(seq(
alias('=', $.arg_identifier),
alias($.eq_sep_val, $.args)
)),
),
macro_identifier: $ => /\([-\*]?/,
macro_call_content: $ => prec.left(seq(
optional($.args),
')',
)),
macro_call_full_content: $ => seq('(', $.macro_call_content),
macro_content: $ => prec(1, seq(
field('name', $.arg),
optional($.args),
optional(seq(
';',
$._command,
repeat(seq(';', $._command)),
)),
')',
)),
macro_args: $ => seq(
$.macro_content,
optional(
seq(
optional($.macro_call_full_content),
)
),
),
redirect_command: $ => prec.right(2, seq(
field('command', $._simple_command),
field('redirect_operator', $._redirect_operator),
field('arg', $.arg),
)),
_redirect_operator: $ => choice(
$.fdn_redirect_operator,
$.fdn_append_operator,
$.html_redirect_operator,
$.html_append_operator,
),
fdn_redirect_operator: $ => seq(optional($.file_descriptor), '>'),
fdn_append_operator: $ => seq(optional($.file_descriptor), '>>'),
html_redirect_operator: $ => 'H>',
html_append_operator: $ => 'H>>',
_arg: $ => choice(
$.arg_identifier,
$.double_quoted_arg,
$.single_quoted_arg,
$.cmd_substitution_arg,
seq(
alias('(', $.arg_identifier),
$.args,
alias(')', $.arg_identifier),
),
alias(',', $.arg_identifier),
),
_arg_brace: $ => choice(
alias($.arg_identifier_brace, $.arg_identifier),
$.double_quoted_arg,
$.single_quoted_arg,
$.cmd_substitution_arg,
seq(
alias('(', $.arg_identifier),
$._arg_brace,
alias(')', $.arg_identifier),
),
alias(',', $.arg_identifier),
),
arg: $ => choice(
$._arg,
$.concatenation,
),
arg_brace: $ => choice(
$._arg_brace,
alias($.concatenation_brace, $.concatenation),
),
args: $ => prec.left(repeat1($.arg)),
// TODO: this should accept a quoted_arg and a cmd_substitution_arg as well
tmp_eval_args: $ => prec.left(seq($.tmp_eval_arg, repeat(seq(',', $.tmp_eval_arg)))),
tmp_eval_arg: $ => repeat1(noneOf(...SPECIAL_CHARACTERS_COMMA)),
_eq_sep_key_single: $ => choice(
alias ($._eq_sep_key_identifier, $.arg_identifier),
$.double_quoted_arg,
$.single_quoted_arg,
$.cmd_substitution_arg,
),
eq_sep_key: $ => prec.left(seq(
alias($._eq_sep_key_single, $.arg),
repeat(seq(
$._eq_sep_concat,
alias($._eq_sep_key_single, $.arg),
)),
)),
_eq_sep_key_identifier: $ => token(repeat1(
choice(
repeat1(noneOf(...SPECIAL_CHARACTERS_EQUAL)),
/\$[^({]/,
/\${[^\r\n $}]+}/,
escape(...SPECIAL_CHARACTERS_EQUAL),
)
)),
eq_sep_val: $ => prec.left(seq(
$.arg,
repeat(seq(
$._eq_sep_concat,
$.arg,
)),
)),
_any_command: $ => /[^\r\n;~|]+/,
arg_identifier: $ => token(repeat1(ARG_IDENTIFIER_BASE)),
arg_identifier_brace: $ => token(repeat1(ARG_IDENTIFIER_BRACE)),
double_quoted_arg: $ => seq(
'"',
repeat(choice(
/[^\\"\n$`]+/,
/\$[^("]?/,
/\\[\\"\n$`]?/,
$.cmd_substitution_arg,
)),
'"',
),
single_quoted_arg: $ => seq(
'\'',
repeat(choice(
/[^\\'\n]+/,
/\\[\\'\n]?/,
)),
'\'',
),
cmd_substitution_arg: $ => choice(
seq('$(', $._commands_singleline, ')'),
prec(1, seq('`', $._commands_singleline, '`')),
),
concatenation: $ => prec(-1, seq(
$._arg,
repeat1(prec(-1, seq(
$._concat,
$._arg,
))),
)),
concatenation_brace: $ => prec(-1, seq(
$._arg_brace,
repeat1(prec(-1, seq(
$._concat_brace,
$._arg_brace,
))),
)),
_dec_number: $ => choice(/[1-9][0-9]*/, /[0-9][0-9]+/),
_comment: $ => token(choice(
'#',
/#[^!][^\r\n]*/,
seq('/*', /[^*]*\*+([^/*][^*]*\*+)*/, '/')
)),
cmd_delimiter: $ => choice(
'\n',
'\r',
$.cmd_delimiter_singleline,
),
cmd_delimiter_singleline: $ => choice(';'),
}
});
/**
 * Build a regular expression that matches any single character which is NOT
 * one of the given entries.
 *
 * An entry that is exactly one backslash is doubled so it stays literal inside
 * the character class; every other entry is inserted verbatim.
 */
function noneOf(...characters) {
  const pieces = []
  for (const entry of characters) {
    pieces.push(entry == '\\' ? '\\\\' : entry)
  }
  return new RegExp('[^' + pieces.join('') + ']')
}

View File

@ -0,0 +1,13 @@
// Load the compiled native binding: prefer the Release build, fall back to the
// Debug build, and if neither loads re-throw the error from the Release attempt.
let binding;
try {
  binding = require("./build/Release/tree_sitter_r2cmd_binding");
} catch (releaseError) {
  try {
    binding = require("./build/Debug/tree_sitter_r2cmd_binding");
  } catch (_) {
    throw releaseError
  }
}
module.exports = binding;

// Attach the node type description when it is available; failure is ignored.
try {
  module.exports.nodeTypeInfo = require("./src/node-types.json");
} catch (_) {}

View File

@ -0,0 +1,19 @@
{
"name": "tree-sitter-r2cmd",
"version": "1.0.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"nan": {
"version": "2.14.0",
"resolved": "https://registry.npmjs.org/nan/-/nan-2.14.0.tgz",
"integrity": "sha512-INOFj37C7k3AfaNTtX8RhsTw7qRy7eLET14cROi9+5HAVbbHuIWUHEauBv5qT4Av2tWasiTY1Jw6puUNqRJXQg=="
},
"tree-sitter-cli": {
"version": "0.16.4",
"resolved": "https://registry.npmjs.org/tree-sitter-cli/-/tree-sitter-cli-0.16.4.tgz",
"integrity": "sha512-akCVeK7oOZD+frizRbBx3h6OBlVBxOCNtfpt9nz3zvOdRuJTwoyJUshzF28J+hfcuvQ+yfoZx9/R+2S7NZE2TA==",
"dev": true
}
}
}

View File

@ -0,0 +1,17 @@
{
"name": "tree-sitter-r2cmd",
"version": "1.0.0",
"description": "Tree-Sitter grammar for parsing radare2 commands",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Riccardo Schirone",
"license": "ISC",
"dependencies": {
"nan": "^2.14.0"
},
"devDependencies": {
"tree-sitter-cli": "^0.16.4"
}
}

View File

@ -0,0 +1,28 @@
#include "tree_sitter/parser.h"
#include <node.h>
#include "nan.h"
using namespace v8;
extern "C" TSLanguage * tree_sitter_r2cmd();
namespace {
// Constructor callback backing the `Language` function template; its body is empty.
NAN_METHOD(New) {}
// Module initializer: creates one `Language` object wrapping the r2cmd grammar
// and installs it as the module's `exports`.
void Init(Local<Object> exports, Local<Object> module) {
// Function template named "Language" whose instances reserve one internal field.
Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
tpl->SetClassName(Nan::New("Language").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
// Instantiate the template once in the current context.
Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
// Internal field 0 carries the TSLanguage pointer returned by tree_sitter_r2cmd().
Nan::SetInternalFieldPointer(instance, 0, tree_sitter_r2cmd());
// Expose the grammar name as the `name` property of the instance.
Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("r2cmd").ToLocalChecked());
// Replace `module.exports` with the instance (the `exports` argument is not used).
Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
}
NODE_MODULE(tree_sitter_r2cmd_binding, Init)
} // namespace

4012
shlr/radare2-shell-parser/src/grammar.json generated Normal file

File diff suppressed because it is too large Load Diff

14136
shlr/radare2-shell-parser/src/node-types.json generated Normal file

File diff suppressed because it is too large Load Diff

35833
shlr/radare2-shell-parser/src/parser.c generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,169 @@
#include <tree_sitter/parser.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#define CMD_IDENTIFIER_MAX_LENGTH 32
// Tokens this external scanner can report through `lexer->result_symbol`; the
// values are also used as indices into `valid_symbols`.
// NOTE(review): tree-sitter pairs these by position with the grammar's
// `externals` list, which is not visible here — confirm the order agrees.
enum TokenType {
// Command name that is not a help request.
CMD_IDENTIFIER,
// Command name ending in `?` (or `?*`), longer than one character.
HELP_COMMAND,
// Run of decimal digits immediately followed by `>`.
FILE_DESCRIPTOR,
// Zero-width marker: the next character is not whitespace, `=` or NUL.
EQ_SEP_CONCAT,
// Zero-width marker: the next character satisfies is_concat().
CONCAT,
// Zero-width marker: the next character satisfies is_concat_brace().
CONCAT_BRACE,
// Zero-width marker: the next character satisfies is_concat_pf_dot().
CONCAT_PF_DOT,
};
// Scanner construction hook. The scanner keeps no state between calls, so no
// payload is allocated and NULL is returned.
// Declared with `(void)` so the definition is a real prototype, matching the
// `void *(*create)(void)` slot it is stored in.
void *tree_sitter_r2cmd_external_scanner_create(void) {
	return NULL;
}
// Scanner teardown hook. Does nothing: `payload` is ignored and nothing is freed.
void tree_sitter_r2cmd_external_scanner_destroy(void *payload) {
}
// State serialization hook. Writes nothing to `buffer` and reports 0 bytes used.
unsigned tree_sitter_r2cmd_external_scanner_serialize(void *payload, char *buffer) {
return 0;
}
// State deserialization hook. Does nothing: all arguments are ignored.
void tree_sitter_r2cmd_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
}
// True when `s` starts with "pf" or is exactly "Cf".
static bool is_pf_cmd(const char *s) {
	if (strncmp (s, "pf", 2) == 0) {
		return true;
	}
	return strcmp (s, "Cf") == 0;
}
// True when `s` starts with "env".
static bool is_env_cmd(const char *s) {
	const int differs = strncmp (s, "env", 3);
	return differs == 0;
}
// True when the first character of `s` is '@'.
static bool is_at_cmd(const char *s) {
	const char first = *s;
	return first == '@';
}
// True when `s` starts with the block-comment opener "/*".
static bool is_comment(const char *s) {
	const int differs = strncmp (s, "/*", 2);
	return differs == 0;
}
// True when `ch` is one of the punctuation characters `* ( @ | . % ~ &`.
static bool is_special_start(const int32_t ch) {
	switch (ch) {
	case '*':
	case '(':
	case '@':
	case '|':
	case '.':
	case '%':
	case '~':
	case '&':
		return true;
	default:
		return false;
	}
}
// True when `ch` may begin a command token: a letter, one of `$ ? : + = / _`,
// or any character accepted by is_special_start().
static bool is_start_of_command(const int32_t ch) {
	// <ctype.h> classifiers are only defined for values representable as
	// unsigned char (or EOF). `ch` is a full code point, so anything outside
	// that range is treated as "not a letter" instead of being passed through.
	const bool is_letter = ch >= 0 && ch <= 0xff && isalpha (ch);
	return is_letter || ch == '$' || ch == '?' || ch == ':' || ch == '+' ||
		ch == '=' || ch == '/' || ch == '_' || is_special_start (ch);
}
// True when `ch` may continue a command token whose text so far is `res`:
// a letter or digit, one of `$ ? . ! : + = / * - , &`, or `@` when the token
// itself started with `@`.
static bool is_mid_command(const char *res, const int32_t ch) {
	// <ctype.h> classifiers are only defined for values representable as
	// unsigned char (or EOF). `ch` is a full code point, so anything outside
	// that range is treated as "not alphanumeric".
	const bool is_alnum = ch >= 0 && ch <= 0xff && isalnum (ch);
	return is_alnum || ch == '$' || ch == '?' || ch == '.' || ch == '!' ||
		ch == ':' || ch == '+' || ch == '=' || ch == '/' || ch == '*' ||
		ch == '-' || ch == ',' || ch == '&' || (is_at_cmd (res) && ch == '@');
}
// True when `ch` can directly continue the current argument, i.e. it is not
// NUL, not whitespace, and not one of the characters `# @ | > ; ) ` ~ \`.
static bool is_concat(const int32_t ch) {
	// <ctype.h> classifiers are only defined for values representable as
	// unsigned char (or EOF). `ch` is a full code point, so anything outside
	// that range is treated as "not whitespace".
	const bool is_space = ch >= 0 && ch <= 0xff && isspace (ch);
	return ch != '\0' && !is_space && ch != '#' && ch != '@' &&
		ch != '|' && ch != '>' && ch != ';' &&
		ch != ')' && ch != '`' && ch != '~' && ch != '\\';
}
// Same as is_concat(), but `{` and `}` are rejected as well.
static bool is_concat_brace(const int32_t ch) {
	if (ch == '{' || ch == '}') {
		return false;
	}
	return is_concat (ch);
}
// Same as is_concat(), but `=` is rejected as well.
static bool is_concat_pf_dot(const int32_t ch) {
	if (ch == '=') {
		return false;
	}
	return is_concat (ch);
}
// True when an identifier of length `id_len` (at least 2) ends with "?*":
// `before_last_ch` and `last_ch` are its final two characters.
static bool is_recursive_help(int id_len, const int32_t before_last_ch, const int32_t last_ch) {
	if (id_len < 2) {
		return false;
	}
	if (before_last_ch != '?') {
		return false;
	}
	return last_ch == '*';
}
// Try to lex a FILE_DESCRIPTOR token: optional leading whitespace (skipped),
// then one or more decimal digits that are immediately followed by `>`.
// The `>` itself is not consumed. Returns true and sets `result_symbol` on
// success; returns false when FILE_DESCRIPTOR is not expected or the input
// does not have that shape.
static bool scan_number(TSLexer *lexer, const bool *valid_symbols) {
	if (!valid_symbols[FILE_DESCRIPTOR]) {
		return false;
	}
	// skip spaces at the beginning
	// (<ctype.h> classifiers are only defined for unsigned char values, and the
	// lookahead is a full code point, so it is range-checked before isspace)
	while (lexer->lookahead >= 0 && lexer->lookahead <= 0xff && isspace (lexer->lookahead)) {
		lexer->advance (lexer, true);
	}
	// at least one digit is required; a plain range test is safe for any code point
	if (lexer->lookahead < '0' || lexer->lookahead > '9') {
		return false;
	}
	// consume the whole run of digits
	do {
		lexer->advance (lexer, false);
	} while (lexer->lookahead >= '0' && lexer->lookahead <= '9');
	// the digits only form a file descriptor when a redirect follows directly
	if (lexer->lookahead != '>') {
		return false;
	}
	lexer->result_symbol = FILE_DESCRIPTOR;
	return true;
}
// External scanner entry point. Tries, in order:
//  1. the zero-width concatenation markers (EQ_SEP_CONCAT, CONCAT,
//     CONCAT_BRACE, CONCAT_PF_DOT), decided from the lookahead character only;
//  2. a command name, reported as HELP_COMMAND when it ends in `?` / `?*`
//     and as CMD_IDENTIFIER otherwise;
//  3. a FILE_DESCRIPTOR via scan_number().
// Returns true with `lexer->result_symbol` set when a token was recognised.
// `payload` is unused because the scanner keeps no state.
bool tree_sitter_r2cmd_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
	// FIXME: /* in the shell should become a multiline comment
	// <ctype.h> classifiers are only defined for unsigned char values, and the
	// lookahead is a full code point, so it is range-checked before isspace.
	const int32_t first = lexer->lookahead;
	const bool first_is_space = first >= 0 && first <= 0xff && isspace (first);
	if (valid_symbols[EQ_SEP_CONCAT] && !first_is_space && first != '=' && first != '\0') {
		lexer->result_symbol = EQ_SEP_CONCAT;
		return true;
	}
	if (valid_symbols[CONCAT] && is_concat (lexer->lookahead)) {
		lexer->result_symbol = CONCAT;
		return true;
	} else if (valid_symbols[CONCAT_BRACE] && is_concat_brace (lexer->lookahead)) {
		lexer->result_symbol = CONCAT_BRACE;
		return true;
	} else if (valid_symbols[CONCAT_PF_DOT] && is_concat_pf_dot (lexer->lookahead)) {
		lexer->result_symbol = CONCAT_PF_DOT;
		return true;
	}
	if (valid_symbols[CMD_IDENTIFIER] || valid_symbols[HELP_COMMAND]) {
		// text of the candidate command name, capped at CMD_IDENTIFIER_MAX_LENGTH
		char res[CMD_IDENTIFIER_MAX_LENGTH + 1];
		int i_res = 0;
		// leading whitespace is skipped, not made part of the token
		while (lexer->lookahead >= 0 && lexer->lookahead <= 0xff && isspace (lexer->lookahead)) {
			lexer->advance (lexer, true);
		}
		// note: `#` is not a valid start character, so comments never get here
		if (!is_start_of_command (lexer->lookahead)) {
			return false;
		}
		res[i_res++] = (char) lexer->lookahead;
		lexer->advance (lexer, false);
		while (i_res < CMD_IDENTIFIER_MAX_LENGTH && is_mid_command (res, lexer->lookahead)) {
			res[i_res++] = (char) lexer->lookahead;
			lexer->advance (lexer, false);
		}
		res[i_res] = '\0';
		if (is_comment (res)) {
			return false;
		}
		// the `i_res >= 2` guard keeps res[i_res - 2] inside the buffer
		if (res[i_res - 1] == '?' || (i_res >= 2 && is_recursive_help (i_res, res[i_res - 2], res[i_res - 1]))) {
			// a lone `?` is left to the grammar's own tokens
			if (i_res == 1) {
				return false;
			}
			lexer->result_symbol = HELP_COMMAND;
		} else {
			// names with a special first character, pf/env/@ commands and
			// positions where no command name is expected are rejected here
			if (is_special_start (res[0]) || is_pf_cmd (res) || is_env_cmd (res) || is_at_cmd (res) || !valid_symbols[CMD_IDENTIFIER]) {
				return false;
			}
			lexer->result_symbol = CMD_IDENTIFIER;
		}
		return true;
	}
	if (valid_symbols[FILE_DESCRIPTOR]) {
		return scan_number (lexer, valid_symbols);
	}
	return false;
}

View File

@ -0,0 +1,223 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
typedef struct {
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef uint16_t TSStateId;
typedef struct {
bool visible : 1;
bool named : 1;
} TSSymbolMetadata;
typedef struct TSLexer TSLexer;
// Lexer interface handed to the generated lexer and to external scanners.
struct TSLexer {
// Character currently under the lexer (read by the START_LEXER macro).
int32_t lookahead;
// Symbol of the recognised token; written by ACCEPT_TOKEN and by scanners.
TSSymbol result_symbol;
// Move to the next character. The bool is the `skip` flag that the SKIP macro
// sets to true — presumably it keeps the character out of the token; see the
// tree-sitter documentation.
void (*advance)(TSLexer *, bool);
// Called by ACCEPT_TOKEN right after the result symbol is stored.
void (*mark_end)(TSLexer *);
// NOTE(review): not used in this file; judging by the name it reports the
// current column — confirm against the tree-sitter documentation.
uint32_t (*get_column)(TSLexer *);
// NOTE(review): not used in this file; semantics not visible here.
bool (*is_at_included_range_start)(const TSLexer *);
// NOTE(review): not used in this file; judging by the name it reports end of
// input — confirm against the tree-sitter documentation.
bool (*eof)(const TSLexer *);
};
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef struct {
union {
struct {
TSStateId state;
bool extra : 1;
bool repetition : 1;
};
struct {
TSSymbol symbol;
int16_t dynamic_precedence;
uint8_t child_count;
uint8_t production_id;
};
} params;
TSParseActionType type : 4;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable : 1;
};
} TSParseActionEntry;
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
const char **symbol_names;
const TSSymbolMetadata *symbol_metadata;
const uint16_t *parse_table;
const TSParseActionEntry *parse_actions;
const TSLexMode *lex_modes;
const TSSymbol *alias_sequences;
uint16_t max_alias_sequence_length;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
uint32_t field_count;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const char **field_names;
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSSymbol *public_symbol_map;
};
/*
* Lexer Macros
*/
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
#define END_STATE() return result;
/*
* Parse Table Macros
*/
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
#define STATE(id) id
#define ACTIONS(id) id
#define SHIFT(state_value) \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = {.state = state_value}, \
} \
}
#define SHIFT_REPEAT(state_value) \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = { \
.state = state_value, \
.repetition = true \
}, \
} \
}
#define RECOVER() \
{ \
{ .type = TSParseActionTypeRecover } \
}
#define SHIFT_EXTRA() \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = {.extra = true} \
} \
}
#define REDUCE(symbol_val, child_count_val, ...) \
{ \
{ \
.type = TSParseActionTypeReduce, \
.params = { \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
} \
} \
}
#define ACCEPT_INPUT() \
{ \
{ .type = TSParseActionTypeAccept } \
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_

View File

@ -0,0 +1,876 @@
#ifndef TREE_SITTER_API_H_
#define TREE_SITTER_API_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
/****************************/
/* Section - ABI Versioning */
/****************************/
/**
* The latest ABI version that is supported by the current version of the
* library. When Languages are generated by the Tree-sitter CLI, they are
* assigned an ABI version number that corresponds to the current CLI version.
* The Tree-sitter library is generally backwards-compatible with languages
* generated using older CLI versions, but is not forwards-compatible.
*/
#define TREE_SITTER_LANGUAGE_VERSION 11
/**
* The earliest ABI version that is supported by the current version of the
* library.
*/
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
/*******************/
/* Section - Types */
/*******************/
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
typedef struct TSParser TSParser;
typedef struct TSTree TSTree;
typedef struct TSQuery TSQuery;
typedef struct TSQueryCursor TSQueryCursor;
typedef enum {
TSInputEncodingUTF8,
TSInputEncodingUTF16,
} TSInputEncoding;
typedef enum {
TSSymbolTypeRegular,
TSSymbolTypeAnonymous,
TSSymbolTypeAuxiliary,
} TSSymbolType;
typedef struct {
uint32_t row;
uint32_t column;
} TSPoint;
typedef struct {
TSPoint start_point;
TSPoint end_point;
uint32_t start_byte;
uint32_t end_byte;
} TSRange;
typedef struct {
void *payload;
const char *(*read)(void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read);
TSInputEncoding encoding;
} TSInput;
typedef enum {
TSLogTypeParse,
TSLogTypeLex,
} TSLogType;
typedef struct {
void *payload;
void (*log)(void *payload, TSLogType, const char *);
} TSLogger;
typedef struct {
uint32_t start_byte;
uint32_t old_end_byte;
uint32_t new_end_byte;
TSPoint start_point;
TSPoint old_end_point;
TSPoint new_end_point;
} TSInputEdit;
typedef struct {
uint32_t context[4];
const void *id;
const TSTree *tree;
} TSNode;
typedef struct {
const void *tree;
const void *id;
uint32_t context[2];
} TSTreeCursor;
typedef struct {
TSNode node;
uint32_t index;
} TSQueryCapture;
typedef struct {
uint32_t id;
uint16_t pattern_index;
uint16_t capture_count;
const TSQueryCapture *captures;
} TSQueryMatch;
typedef enum {
TSQueryPredicateStepTypeDone,
TSQueryPredicateStepTypeCapture,
TSQueryPredicateStepTypeString,
} TSQueryPredicateStepType;
typedef struct {
TSQueryPredicateStepType type;
uint32_t value_id;
} TSQueryPredicateStep;
typedef enum {
TSQueryErrorNone = 0,
TSQueryErrorSyntax,
TSQueryErrorNodeType,
TSQueryErrorField,
TSQueryErrorCapture,
} TSQueryError;
/********************/
/* Section - Parser */
/********************/
/**
* Create a new parser.
*/
TSParser *ts_parser_new(void);
/**
* Delete the parser, freeing all of the memory that it used.
*/
void ts_parser_delete(TSParser *parser);
/**
* Set the language that the parser should use for parsing.
*
* Returns a boolean indicating whether or not the language was successfully
* assigned. True means assignment succeeded. False means there was a version
* mismatch: the language was generated with an incompatible version of the
* Tree-sitter CLI. Check the language's version using `ts_language_version`
* and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION` and
* `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants.
*/
bool ts_parser_set_language(TSParser *self, const TSLanguage *language);
/**
* Get the parser's current language.
*/
const TSLanguage *ts_parser_language(const TSParser *self);
/**
* Set the ranges of text that the parser should include when parsing.
*
* By default, the parser will always include entire documents. This function
* allows you to parse only a *portion* of a document but still return a syntax
* tree whose ranges match up with the document as a whole. You can also pass
* multiple disjoint ranges.
*
* The second and third parameters specify the location and length of an array
* of ranges. The parser does *not* take ownership of these ranges; it copies
* the data, so it doesn't matter how these ranges are allocated.
*
* If `length` is zero, then the entire document will be parsed. Otherwise,
* the given ranges must be ordered from earliest to latest in the document,
* and they must not overlap. That is, the following must hold for all
* `i` < `length - 1`:
*
* ranges[i].end_byte <= ranges[i + 1].start_byte
*
* If this requirement is not satisfied, the operation will fail, the ranges
* will not be assigned, and this function will return `false`. On success,
* this function returns `true`.
*/
bool ts_parser_set_included_ranges(
TSParser *self,
const TSRange *ranges,
uint32_t length
);
/**
* Get the ranges of text that the parser will include when parsing.
*
* The returned pointer is owned by the parser. The caller should not free it
* or write to it. The length of the array will be written to the given
* `length` pointer.
*/
const TSRange *ts_parser_included_ranges(
const TSParser *self,
uint32_t *length
);
/**
* Use the parser to parse some source code and create a syntax tree.
*
* If you are parsing this document for the first time, pass `NULL` for the
* `old_tree` parameter. Otherwise, if you have already parsed an earlier
* version of this document and the document has since been edited, pass the
* previous syntax tree so that the unchanged parts of it can be reused.
* This will save time and memory. For this to work correctly, you must have
* already edited the old syntax tree using the `ts_tree_edit` function in a
* way that exactly matches the source code changes.
*
* The `TSInput` parameter lets you specify how to read the text. It has the
* following three fields:
* 1. `read`: A function to retrieve a chunk of text at a given byte offset
* and (row, column) position. The function should return a pointer to the
* text and write its length to the `bytes_read` pointer. The parser
* does not take ownership of this buffer; it just borrows it until it has
* finished reading it. The function should write a zero value to the
* `bytes_read` pointer to indicate the end of the document.
* 2. `payload`: An arbitrary pointer that will be passed to each invocation
* of the `read` function.
* 3. `encoding`: An indication of how the text is encoded. Either
* `TSInputEncodingUTF8` or `TSInputEncodingUTF16`.
*
* This function returns a syntax tree on success, and `NULL` on failure. There
* are three possible reasons for failure:
* 1. The parser does not have a language assigned. Check for this using the
* `ts_parser_language` function.
* 2. Parsing was cancelled due to a timeout that was set by an earlier call to
* the `ts_parser_set_timeout_micros` function. You can resume parsing from
* where the parser left off by calling `ts_parser_parse` again with the
* same arguments. Or you can start parsing from scratch by first calling
* `ts_parser_reset`.
* 3. Parsing was cancelled using a cancellation flag that was set by an
* earlier call to `ts_parser_set_cancellation_flag`. You can resume parsing
* from where the parser left off by calling `ts_parser_parse` again with
* the same arguments.
*/
TSTree *ts_parser_parse(
TSParser *self,
const TSTree *old_tree,
TSInput input
);
/**
* Use the parser to parse some source code stored in one contiguous buffer.
* The first two parameters are the same as in the `ts_parser_parse` function
* above. The last two parameters indicate the location of the buffer and its
* length in bytes.
*/
TSTree *ts_parser_parse_string(
TSParser *self,
const TSTree *old_tree,
const char *string,
uint32_t length
);
/**
* Use the parser to parse some source code stored in one contiguous buffer with
* a given encoding. The first four parameters work the same as in the
* `ts_parser_parse_string` method above. The final parameter indicates whether
* the text is encoded as UTF8 or UTF16.
*/
TSTree *ts_parser_parse_string_encoding(
TSParser *self,
const TSTree *old_tree,
const char *string,
uint32_t length,
TSInputEncoding encoding
);
/**
* Instruct the parser to start the next parse from the beginning.
*
* If the parser previously failed because of a timeout or a cancellation, then
* by default, it will resume where it left off on the next call to
* `ts_parser_parse` or other parsing functions. If you don't want to resume,
* and instead intend to use this parser to parse some other document, you must
* call `ts_parser_reset` first.
*/
void ts_parser_reset(TSParser *self);
/**
* Set the maximum duration in microseconds that parsing should be allowed to
* take before halting.
*
* If parsing takes longer than this, it will halt early, returning NULL.
* See `ts_parser_parse` for more information.
*/
void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
/**
* Get the duration in microseconds that parsing is allowed to take.
*/
uint64_t ts_parser_timeout_micros(const TSParser *self);
/**
* Set the parser's current cancellation flag pointer.
*
* If a non-null pointer is assigned, then the parser will periodically read
* from this pointer during parsing. If it reads a non-zero value, it will
* halt early, returning NULL. See `ts_parser_parse` for more information.
*/
void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag);
/**
* Get the parser's current cancellation flag pointer.
*/
const size_t *ts_parser_cancellation_flag(const TSParser *self);
/**
* Set the logger that a parser should use during parsing.
*
* The parser does not take ownership over the logger payload. If a logger was
* previously assigned, the caller is responsible for releasing any memory
* owned by the previous logger.
*/
void ts_parser_set_logger(TSParser *self, TSLogger logger);
/**
* Get the parser's current logger.
*/
TSLogger ts_parser_logger(const TSParser *self);
/**
* Set the file descriptor to which the parser should write debugging graphs
* during parsing. The graphs are formatted in the DOT language. You may want
* to pipe these graphs directly to a `dot(1)` process in order to generate
* SVG output. You can turn off this logging by passing a negative number.
*/
void ts_parser_print_dot_graphs(TSParser *self, int file);
/******************/
/* Section - Tree */
/******************/
/**
* Create a shallow copy of the syntax tree. This is very fast.
*
* You need to copy a syntax tree in order to use it on more than one thread at
* a time, as syntax trees are not thread safe.
*/
TSTree *ts_tree_copy(const TSTree *self);
/**
* Delete the syntax tree, freeing all of the memory that it used.
*/
void ts_tree_delete(TSTree *self);
/**
* Get the root node of the syntax tree.
*/
TSNode ts_tree_root_node(const TSTree *self);
/**
* Get the language that was used to parse the syntax tree.
*/
const TSLanguage *ts_tree_language(const TSTree *);
/**
* Edit the syntax tree to keep it in sync with source code that has been
* edited.
*
* You must describe the edit both in terms of byte offsets and in terms of
* (row, column) coordinates.
*/
void ts_tree_edit(TSTree *self, const TSInputEdit *edit);
/**
* Compare an old edited syntax tree to a new syntax tree representing the same
* document, returning an array of ranges whose syntactic structure has changed.
*
* For this to work correctly, the old syntax tree must have been edited such
* that its ranges match up to the new tree. Generally, you'll want to call
* this function right after calling one of the `ts_parser_parse` functions.
* You need to pass the old tree that was passed to parse, as well as the new
* tree that was returned from that function.
*
* The returned array is allocated using `malloc` and the caller is responsible
* for freeing it using `free`. The length of the array will be written to the
* given `length` pointer.
*/
TSRange *ts_tree_get_changed_ranges(
const TSTree *old_tree,
const TSTree *new_tree,
uint32_t *length
);
/**
* Write a DOT graph describing the syntax tree to the given file.
*/
void ts_tree_print_dot_graph(const TSTree *, FILE *);
/******************/
/* Section - Node */
/******************/
/**
* Get the node's type as a null-terminated string.
*/
const char *ts_node_type(TSNode);
/**
* Get the node's type as a numerical id.
*/
TSSymbol ts_node_symbol(TSNode);
/**
* Get the node's start byte.
*/
uint32_t ts_node_start_byte(TSNode);
/**
* Get the node's start position in terms of rows and columns.
*/
TSPoint ts_node_start_point(TSNode);
/**
* Get the node's end byte.
*/
uint32_t ts_node_end_byte(TSNode);
/**
* Get the node's end position in terms of rows and columns.
*/
TSPoint ts_node_end_point(TSNode);
/**
* Get an S-expression representing the node as a string.
*
* This string is allocated with `malloc` and the caller is responsible for
* freeing it using `free`.
*/
char *ts_node_string(TSNode);
/**
* Check if the node is null. Functions like `ts_node_child` and
* `ts_node_next_sibling` will return a null node to indicate that no such node
* was found.
*/
bool ts_node_is_null(TSNode);
/**
* Check if the node is *named*. Named nodes correspond to named rules in the
* grammar, whereas *anonymous* nodes correspond to string literals in the
* grammar.
*/
bool ts_node_is_named(TSNode);
/**
* Check if the node is *missing*. Missing nodes are inserted by the parser in
* order to recover from certain kinds of syntax errors.
*/
bool ts_node_is_missing(TSNode);
/**
* Check if the node is *extra*. Extra nodes represent things like comments,
* which are not required by the grammar, but can appear anywhere.
*/
bool ts_node_is_extra(TSNode);
/**
* Check if a syntax node has been edited.
*/
bool ts_node_has_changes(TSNode);
/**
* Check if the node is a syntax error or contains any syntax errors.
*/
bool ts_node_has_error(TSNode);
/**
* Get the node's immediate parent.
*/
TSNode ts_node_parent(TSNode);
/**
* Get the node's child at the given index, where zero represents the first
* child.
*/
TSNode ts_node_child(TSNode, uint32_t);
/**
* Get the node's number of children.
*/
uint32_t ts_node_child_count(TSNode);
/**
* Get the node's *named* child at the given index.
*
* See also `ts_node_is_named`.
*/
TSNode ts_node_named_child(TSNode, uint32_t);
/**
* Get the node's number of *named* children.
*
* See also `ts_node_is_named`.
*/
uint32_t ts_node_named_child_count(TSNode);
/**
* Get the node's child with the given field name.
*/
TSNode ts_node_child_by_field_name(
TSNode self,
const char *field_name,
uint32_t field_name_length
);
/**
* Get the node's child with the given numerical field id.
*
* You can convert a field name to an id using the
* `ts_language_field_id_for_name` function.
*/
TSNode ts_node_child_by_field_id(TSNode, TSFieldId);
/**
* Get the node's next / previous sibling.
*/
TSNode ts_node_next_sibling(TSNode);
TSNode ts_node_prev_sibling(TSNode);
/**
* Get the node's next / previous *named* sibling.
*/
TSNode ts_node_next_named_sibling(TSNode);
TSNode ts_node_prev_named_sibling(TSNode);
/**
* Get the node's first child that extends beyond the given byte offset.
*/
TSNode ts_node_first_child_for_byte(TSNode, uint32_t);
/**
* Get the node's first named child that extends beyond the given byte offset.
*/
TSNode ts_node_first_named_child_for_byte(TSNode, uint32_t);
/**
* Get the smallest node within this node that spans the given range of bytes
* or (row, column) positions.
*/
TSNode ts_node_descendant_for_byte_range(TSNode, uint32_t, uint32_t);
TSNode ts_node_descendant_for_point_range(TSNode, TSPoint, TSPoint);
/**
* Get the smallest named node within this node that spans the given range of
* bytes or (row, column) positions.
*/
TSNode ts_node_named_descendant_for_byte_range(TSNode, uint32_t, uint32_t);
TSNode ts_node_named_descendant_for_point_range(TSNode, TSPoint, TSPoint);
/**
* Edit the node to keep it in-sync with source code that has been edited.
*
* This function is only rarely needed. When you edit a syntax tree with the
* `ts_tree_edit` function, all of the nodes that you retrieve from the tree
* afterward will already reflect the edit. You only need to use `ts_node_edit`
* when you have a `TSNode` instance that you want to keep and continue to use
* after an edit.
*/
void ts_node_edit(TSNode *, const TSInputEdit *);
/**
* Check if two nodes are identical.
*/
bool ts_node_eq(TSNode, TSNode);
/************************/
/* Section - TreeCursor */
/************************/
/**
* Create a new tree cursor starting from the given node.
*
* A tree cursor allows you to walk a syntax tree more efficiently than is
* possible using the `TSNode` functions. It is a mutable object that is always
* on a certain syntax node, and can be moved imperatively to different nodes.
*/
TSTreeCursor ts_tree_cursor_new(TSNode);
/**
* Delete a tree cursor, freeing all of the memory that it used.
*/
void ts_tree_cursor_delete(TSTreeCursor *);
/**
* Re-initialize a tree cursor to start at a different node.
*/
void ts_tree_cursor_reset(TSTreeCursor *, TSNode);
/**
* Get the tree cursor's current node.
*/
TSNode ts_tree_cursor_current_node(const TSTreeCursor *);
/**
* Get the field name of the tree cursor's current node.
*
* This returns `NULL` if the current node doesn't have a field.
* See also `ts_node_child_by_field_name`.
*/
const char *ts_tree_cursor_current_field_name(const TSTreeCursor *);
/**
* Get the field id of the tree cursor's current node.
*
* This returns zero if the current node doesn't have a field.
* See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`.
*/
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *);
/**
* Move the cursor to the parent of its current node.
*
* This returns `true` if the cursor successfully moved, and returns `false`
* if there was no parent node (the cursor was already on the root node).
*/
bool ts_tree_cursor_goto_parent(TSTreeCursor *);
/**
* Move the cursor to the next sibling of its current node.
*
* This returns `true` if the cursor successfully moved, and returns `false`
* if there was no next sibling node.
*/
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
/**
* Move the cursor to the first child of its current node.
*
* This returns `true` if the cursor successfully moved, and returns `false`
* if there were no children.
*/
bool ts_tree_cursor_goto_first_child(TSTreeCursor *);
/**
* Move the cursor to the first child of its current node that extends beyond
* the given byte offset.
*
* This returns the index of the child node if one was found, and returns -1
* if no such child was found.
*/
int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t);
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *);
/*******************/
/* Section - Query */
/*******************/
/**
* Create a new query from a string containing one or more S-expression
* patterns. The query is associated with a particular language, and can
* only be run on syntax nodes parsed with that language.
*
* If all of the given patterns are valid, this returns a `TSQuery`.
* If a pattern is invalid, this returns `NULL`, and provides two pieces
* of information about the problem:
* 1. The byte offset of the error is written to the `error_offset` parameter.
* 2. The type of error is written to the `error_type` parameter.
*/
TSQuery *ts_query_new(
const TSLanguage *language,
const char *source,
uint32_t source_len,
uint32_t *error_offset,
TSQueryError *error_type
);
/**
* Delete a query, freeing all of the memory that it used.
*/
void ts_query_delete(TSQuery *);
/**
* Get the number of patterns, captures, or string literals in the query.
*/
uint32_t ts_query_pattern_count(const TSQuery *);
uint32_t ts_query_capture_count(const TSQuery *);
uint32_t ts_query_string_count(const TSQuery *);
/**
* Get the byte offset where the given pattern starts in the query's source.
*
* This can be useful when combining queries by concatenating their source
* code strings.
*/
uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t);
/**
* Get all of the predicates for the given pattern in the query.
*
* The predicates are represented as a single array of steps. There are three
* types of steps in this array, which correspond to the three legal values for
* the `type` field:
* - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names
* of captures. Their `value_id` can be used with the
* `ts_query_capture_name_for_id` function to obtain the name of the capture.
* - `TSQueryPredicateStepTypeString` - Steps with this type represent literal
* strings. Their `value_id` can be used with the
* `ts_query_string_value_for_id` function to obtain their string value.
* - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels*
* that represent the end of an individual predicate. If a pattern has two
* predicates, then there will be two steps with this `type` in the array.
*/
const TSQueryPredicateStep *ts_query_predicates_for_pattern(
const TSQuery *self,
uint32_t pattern_index,
uint32_t *length
);
/**
* Get the name and length of one of the query's captures, or one of the
* query's string literals. Each capture and string is associated with a
* numeric id based on the order that it appeared in the query's source.
*/
const char *ts_query_capture_name_for_id(
const TSQuery *,
uint32_t id,
uint32_t *length
);
const char *ts_query_string_value_for_id(
const TSQuery *,
uint32_t id,
uint32_t *length
);
/**
* Disable a certain capture within a query.
*
* This prevents the capture from being returned in matches, and also avoids
* any resource usage associated with recording the capture. Currently, there
* is no way to undo this.
*/
void ts_query_disable_capture(TSQuery *, const char *, uint32_t);
/**
* Disable a certain pattern within a query.
*
* This prevents the pattern from matching and removes most of the overhead
* associated with the pattern. Currently, there is no way to undo this.
*/
void ts_query_disable_pattern(TSQuery *, uint32_t);
/**
* Create a new cursor for executing a given query.
*
* The cursor stores the state that is needed to iteratively search
* for matches. To use the query cursor, first call `ts_query_cursor_exec`
* to start running a given query on a given syntax node. Then, there are
* two options for consuming the results of the query:
* 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the
* *matches* in the order that they were found. Each match contains the
* index of the pattern that matched, and an array of captures. Because
* multiple patterns can match the same set of nodes, one match may contain
* captures that appear *before* some of the captures from a previous match.
* 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the
* individual *captures* in the order that they appear. This is useful if
* you don't care about which pattern matched, and just want a single ordered
* sequence of captures.
*
* If you don't care about consuming all of the results, you can stop calling
* `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point.
* You can then start executing another query on another node by calling
* `ts_query_cursor_exec` again.
*/
TSQueryCursor *ts_query_cursor_new(void);
/**
* Delete a query cursor, freeing all of the memory that it used.
*/
void ts_query_cursor_delete(TSQueryCursor *);
/**
* Start running a given query on a given node.
*/
void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
/**
* Set the range of bytes or (row, column) positions in which the query
* will be executed.
*/
void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t);
void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint);
/**
* Advance to the next match of the currently running query.
*
* If there is a match, write it to `*match` and return `true`.
* Otherwise, return `false`.
*/
bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match);
void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id);
/**
* Advance to the next capture of the currently running query.
*
* If there is a capture, write its match to `*match` and its index within
* the match's capture list to `*capture_index`. Otherwise, return `false`.
*/
bool ts_query_cursor_next_capture(
TSQueryCursor *,
TSQueryMatch *match,
uint32_t *capture_index
);
/**********************/
/* Section - Language */
/**********************/
/**
* Get the number of distinct node types in the language.
*/
uint32_t ts_language_symbol_count(const TSLanguage *);
/**
* Get a node type string for the given numerical id.
*/
const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
/**
* Get the numerical id for the given node type string.
*/
TSSymbol ts_language_symbol_for_name(
const TSLanguage *self,
const char *string,
uint32_t length,
bool is_named
);
/**
* Get the number of distinct field names in the language.
*/
uint32_t ts_language_field_count(const TSLanguage *);
/**
* Get the field name string for the given numerical id.
*/
const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId);
/**
* Get the numerical id for the given field name string.
*/
TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t);
/**
* Check whether the given node type id belongs to named nodes, anonymous nodes,
* or hidden nodes.
*
* See also `ts_node_is_named`. Hidden nodes are never returned from the API.
*/
TSSymbolType ts_language_symbol_type(const TSLanguage *, TSSymbol);
/**
* Get the ABI version number for this language. This version number is used
* to ensure that languages were generated by a compatible version of
* Tree-sitter.
*
* See also `ts_parser_set_language`.
*/
uint32_t ts_language_version(const TSLanguage *);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_API_H_

View File

@ -0,0 +1,223 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
/*
 * One element of the `field_map_entries` table referenced by `TSLanguage`.
 * It pairs a field id with the index of a child.
 * NOTE(review): `inherited` presumably marks a field that comes from a
 * nested (hidden) child rather than the node itself — confirm against the
 * table generator.
 */
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
/*
 * An (index, length) pair; element type of the `field_map_slices` table
 * referenced by `TSLanguage`.
 * NOTE(review): presumably selects a run of `TSFieldMapEntry` items inside
 * `field_map_entries` — confirm.
 */
typedef struct {
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef uint16_t TSStateId;
/*
 * Two one-bit flags stored per symbol (element type of `symbol_metadata` in
 * `TSLanguage`).
 * NOTE(review): `named` presumably matches the named/anonymous distinction
 * and `visible` the hidden-node distinction of the public API — confirm.
 */
typedef struct {
bool visible : 1;
bool named : 1;
} TSSymbolMetadata;
/*
 * Interface handed to lex functions (`lex_fn`, `keyword_lex_fn` and the
 * external scanner's `scan` in `TSLanguage` all take a `TSLexer *`).
 *
 * The lexer macros in this header use it as follows:
 * - `lookahead` is read at the start of every state (`START_LEXER`).
 * - `advance(lexer, skip)` is called to move to the next state
 *   (`ADVANCE` / `SKIP` via the `next_state` label).
 * - `result_symbol` is written and `mark_end` is called by `ACCEPT_TOKEN`.
 *
 * `get_column`, `is_at_included_range_start` and `eof` are not used by the
 * macros in this header.
 */
typedef struct TSLexer TSLexer;
struct TSLexer {
int32_t lookahead;
TSSymbol result_symbol;
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
};
/*
 * Discriminator stored in `TSParseAction.type`. Each value is produced by
 * the parse table macros in this header: `SHIFT`, `SHIFT_REPEAT` and
 * `SHIFT_EXTRA` use Shift, `REDUCE` uses Reduce, `ACCEPT_INPUT` uses Accept
 * and `RECOVER` uses Recover.
 */
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
/*
 * A single parse action: a `type` tag (stored in a 4-bit field) plus
 * parameters whose meaning depends on that tag.
 *
 * `params` is a union of two anonymous structs:
 * - `state`, `extra`, `repetition`: filled in by the shift macros
 *   (`SHIFT`, `SHIFT_REPEAT`, `SHIFT_EXTRA`).
 * - `symbol`, `dynamic_precedence`, `child_count`, `production_id`:
 *   filled in by `REDUCE` (`symbol` and `child_count` always, the others
 *   through its variadic designators).
 */
typedef struct {
union {
struct {
TSStateId state;
bool extra : 1;
bool repetition : 1;
};
struct {
TSSymbol symbol;
int16_t dynamic_precedence;
uint8_t child_count;
uint8_t production_id;
};
} params;
TSParseActionType type : 4;
} TSParseAction;
/*
 * Pair of lexer state ids; element type of the `lex_modes` table in
 * `TSLanguage`.
 * NOTE(review): presumably one entry per parse state, giving the start
 * state for the internal lexer and for the external scanner — confirm.
 */
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
/*
 * Element type of the `parse_actions` table in `TSLanguage`. Each slot is
 * either a `TSParseAction` or a small header made of `count` and
 * `reusable`.
 * NOTE(review): presumably a header slot is followed by `count` action
 * slots — confirm against the generated tables.
 */
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable : 1;
};
} TSParseActionEntry;
/*
 * Static description of one grammar: counts, name tables, parse tables,
 * lexer entry points and optional external-scanner callbacks.
 *
 * `version` is the value reported by `ts_language_version`, which the public
 * header describes as an ABI version number.
 * NOTE(review): because generated parsers fill this struct in directly, the
 * field order and types are presumably part of that ABI — do not reorder
 * without confirming.
 */
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
const char **symbol_names;
const TSSymbolMetadata *symbol_metadata;
const uint16_t *parse_table;
const TSParseActionEntry *parse_actions;
const TSLexMode *lex_modes;
const TSSymbol *alias_sequences;
uint16_t max_alias_sequence_length;
/* Lexer entry points; both receive the lexer and a starting state id. */
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
/* Callbacks and tables for a hand-written external scanner. */
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
uint32_t field_count;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const char **field_names;
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSSymbol *public_symbol_map;
};
/*
* Lexer Macros
*/
/*
 * START_LEXER: prologue of a lex function. It expects a `TSLexer *lexer` in
 * scope and declares the locals `result`, `skip`, `eof` and `lookahead`.
 *
 * Control flow: execution jumps straight to `start`, which clears `skip` and
 * loads the current lookahead. Jumping to `next_state` (done by ADVANCE and
 * SKIP) first calls `lexer->advance(lexer, skip)` and then falls through
 * into `start`. `eof` is declared here but not used by any macro in this
 * header.
 */
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
/*
 * ADVANCE: store the next state in the caller's `state` variable and jump to
 * `next_state`, which advances the lexer with `skip == false`.
 */
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
/*
 * SKIP: same as ADVANCE, but sets `skip` first so that the pending
 * `lexer->advance` call receives `true` as its second argument.
 */
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
/*
 * ACCEPT_TOKEN: record a successful match, setting `result`, storing the
 * symbol in the lexer and calling `mark_end`.
 * This expands to three separate statements with no enclosing braces, so it
 * must not be used as the unbraced body of an `if` or loop.
 */
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
/* END_STATE: leave the lex function, returning whether a token was accepted. */
#define END_STATE() return result;
/*
* Parse Table Macros
*/
/*
 * SMALL_STATE: rebase a state id by LARGE_STATE_COUNT, which the including
 * translation unit must define. The expansion is fully parenthesized so that
 * it stays a single operand when the argument or the surrounding expression
 * contains operators (for example `2 * SMALL_STATE(x)` or
 * `SMALL_STATE(c ? a : b)`).
 */
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
/* STATE / ACTIONS: identity markers that only label values in the tables. */
#define STATE(id) id
#define ACTIONS(id) id
/*
 * The macros below each expand to a brace-enclosed initializer whose inner
 * braces use the designators `.type` and `.params` of `TSParseAction`. The
 * extra outer brace level lets them initialize a `TSParseActionEntry`, whose
 * first union member is a `TSParseAction`.
 */

/* SHIFT: shift action that moves to `state_value`. */
#define SHIFT(state_value) \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = {.state = state_value}, \
} \
}
/* SHIFT_REPEAT: like SHIFT, with the `repetition` flag set. */
#define SHIFT_REPEAT(state_value) \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = { \
.state = state_value, \
.repetition = true \
}, \
} \
}
/* RECOVER: action of type Recover; no parameters are set. */
#define RECOVER() \
{ \
{ .type = TSParseActionTypeRecover } \
}
/* SHIFT_EXTRA: shift action with only the `extra` flag set. */
#define SHIFT_EXTRA() \
{ \
{ \
.type = TSParseActionTypeShift, \
.params = {.extra = true} \
} \
}
/*
 * REDUCE: reduce action for `symbol_val` with `child_count_val` children.
 * Any further arguments are pasted in as extra designated initializers of
 * `params` (the struct also has `dynamic_precedence` and `production_id`).
 */
#define REDUCE(symbol_val, child_count_val, ...) \
{ \
{ \
.type = TSParseActionTypeReduce, \
.params = { \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
} \
} \
}
/* ACCEPT_INPUT: action of type Accept; no parameters are set. */
#define ACCEPT_INPUT() \
{ \
{ .type = TSParseActionTypeAccept } \
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_

View File

@ -0,0 +1,81 @@
#ifndef TREE_SITTER_ALLOC_H_
#define TREE_SITTER_ALLOC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#if defined(TREE_SITTER_TEST)
/*
 * Recording allocator entry points. They are only declared here; the
 * definitions live outside this header.
 */
void *ts_record_malloc(size_t size);
void *ts_record_calloc(size_t count, size_t size);
void *ts_record_realloc(void *buffer, size_t size);
void ts_record_free(void *buffer);
bool ts_toggle_allocation_recording(bool value);

/* Allocate `size` bytes through the recording allocator. */
static inline void *ts_malloc(size_t size) {
  void *block = ts_record_malloc(size);
  return block;
}

/* Allocate `count` elements of `size` bytes through the recording allocator. */
static inline void *ts_calloc(size_t count, size_t size) {
  void *block = ts_record_calloc(count, size);
  return block;
}

/* Resize `buffer` to `size` bytes through the recording allocator. */
static inline void *ts_realloc(void *buffer, size_t size) {
  void *block = ts_record_realloc(buffer, size);
  return block;
}

/* Release `buffer` through the recording allocator. */
static inline void ts_free(void *buffer) {
  ts_record_free(buffer);
}
#else
#include <stdlib.h>
/*
 * Stub used when allocation recording is compiled out: the request is
 * ignored and `false` is always returned.
 */
static inline bool ts_toggle_allocation_recording(bool value) {
  /* Explicitly discard the argument so that every includer of this header
   * builds cleanly with unused-parameter warnings enabled. */
  (void)value;
  return false;
}
/*
 * malloc wrapper that never returns a failed allocation: if a non-zero
 * request cannot be satisfied, a message is written to stderr and the
 * process exits with status 1. A zero-size request returns whatever
 * `malloc(0)` returns.
 */
static inline void *ts_malloc(size_t size) {
  void *result = malloc(size);
  if (size > 0 && !result) {
    /* `size` is a size_t, so it needs the `z` length modifier; `%lu` is
     * wrong wherever unsigned long is narrower than size_t. */
    fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size);
    exit(1);
  }
  return result;
}
/*
 * calloc wrapper that never returns a failed allocation: if a non-empty
 * request cannot be satisfied, a message is written to stderr and the
 * process exits with status 1.
 */
static inline void *ts_calloc(size_t count, size_t size) {
  void *result = calloc(count, size);
  /* A NULL result is only an error when bytes were actually requested;
   * calloc may legitimately return NULL when either factor is zero. */
  if (count > 0 && size > 0 && !result) {
    /* The product is a size_t, so it needs the `z` length modifier. */
    fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size);
    exit(1);
  }
  return result;
}
/*
 * realloc wrapper that never returns a failed allocation: if a non-zero
 * request cannot be satisfied, a message is written to stderr and the
 * process exits with status 1.
 */
static inline void *ts_realloc(void *buffer, size_t size) {
  void *result = realloc(buffer, size);
  if (size > 0 && !result) {
    /* `size` is a size_t, so it needs the `z` length modifier; `%lu` is
     * wrong wherever unsigned long is narrower than size_t. */
    fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size);
    exit(1);
  }
  return result;
}
/* Hand `buffer` back to the C allocator; counterpart of the wrappers above. */
static inline void ts_free(void *buffer)
{
  free(buffer);
}
#endif
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_ALLOC_H_

View File

@ -0,0 +1,142 @@
#ifndef TREE_SITTER_ARRAY_H_
#define TREE_SITTER_ARRAY_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <stdbool.h>
#include "./alloc.h"
/*
 * Generic growable array. `Array(T)` declares an anonymous struct holding a
 * pointer to the elements, the number of elements in use and the number of
 * elements allocated. The array_* macros below take a pointer to such a
 * struct; the ones that allocate or move memory forward to the array__*
 * helpers after casting to `VoidArray *` and passing the element size.
 *
 * Every macro argument is parenthesized where it is used as an operand, so
 * arbitrary expressions may be passed as `self`, `index` or `element`.
 */
#define Array(T)       \
  struct {             \
    T *contents;       \
    uint32_t size;     \
    uint32_t capacity; \
  }

/* Reset all three fields without releasing any memory. */
#define array_init(self) \
  ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL)

/* Initializer for an empty array (contents, size, capacity). */
#define array_new() \
  { NULL, 0, 0 }

/* Pointer to element `index`; asserts that the index is in bounds. */
#define array_get(self, index) \
  (assert((uint32_t)(index) < (self)->size), &(self)->contents[index])

#define array_front(self) array_get(self, 0)

#define array_back(self) array_get(self, (self)->size - 1)

/* Forget all elements but keep the allocation. */
#define array_clear(self) ((self)->size = 0)

#define array_reserve(self, new_capacity) \
  array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity)

#define array_erase(self, index) \
  array__erase((VoidArray *)(self), array__elem_size(self), index)

#define array_delete(self) array__delete((VoidArray *)(self))

/* Append one element, growing the storage first if necessary. */
#define array_push(self, element)                               \
  (array__grow((VoidArray *)(self), 1, array__elem_size(self)), \
   (self)->contents[(self)->size++] = (element))

/* Append `count` zero-filled elements. */
#define array_grow_by(self, count)                                              \
  (array__grow((VoidArray *)(self), count, array__elem_size(self)),             \
   memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \
   (self)->size += (count))

#define array_push_all(self, other) \
  array_splice((self), (self)->size, 0, (other)->size, (other)->contents)

/* Replace `old_count` elements at `index` with `new_count` new ones. */
#define array_splice(self, index, old_count, new_count, new_contents)          \
  array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \
                new_count, new_contents)

/* Insert one element at `index`; `element` must be an lvalue. */
#define array_insert(self, index, element) \
  array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &(element))

/* Remove and yield the last element; the array must not be empty. */
#define array_pop(self) ((self)->contents[--(self)->size])

#define array_assign(self, other) \
  array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self))

// Private

typedef Array(void) VoidArray;

#define array__elem_size(self) sizeof(*(self)->contents)
/* Release the element storage and leave the array empty and reusable. */
static inline void array__delete(VoidArray *self) {
  ts_free(self->contents);
  self->capacity = 0;
  self->size = 0;
  self->contents = NULL;
}
/*
 * Remove the element at `index` by sliding every later element down one
 * slot. `index` must be in bounds (asserted).
 */
static inline void array__erase(VoidArray *self, size_t element_size,
                                uint32_t index) {
  assert(index < self->size);

  char *base = (char *)self->contents;
  char *hole = base + index * element_size;
  char *tail = base + (index + 1) * element_size;
  size_t tail_bytes = (self->size - index - 1) * element_size;

  memmove(hole, tail, tail_bytes);
  self->size--;
}
/*
 * Make room for at least `new_capacity` elements. The capacity never
 * shrinks: a request that fits in the current capacity is a no-op.
 */
static inline void array__reserve(VoidArray *self, size_t element_size, uint32_t new_capacity) {
  if (new_capacity <= self->capacity) return;

  /* Existing storage is resized; the first allocation is zero-filled. */
  self->contents = self->contents
    ? ts_realloc(self->contents, new_capacity * element_size)
    : ts_calloc(new_capacity, element_size);
  self->capacity = new_capacity;
}
/*
 * Make `self` an element-wise copy of `other`. Storage is grown if needed;
 * it is never shrunk.
 */
static inline void array__assign(VoidArray *self, const VoidArray *other, size_t element_size) {
  array__reserve(self, element_size, other->size);
  self->size = other->size;
  /* When `other` is empty both `contents` pointers may be NULL, and passing
   * NULL to memcpy is undefined even for a zero length, so skip the call. */
  if (self->size > 0) {
    memcpy(self->contents, other->contents, self->size * element_size);
  }
}
/*
 * Ensure there is room for `count` more elements. When the array is too
 * small the capacity is doubled, with a floor of 8 elements and never less
 * than what is actually required.
 */
static inline void array__grow(VoidArray *self, size_t count, size_t element_size) {
  size_t required = self->size + count;
  if (required <= self->capacity) return;

  size_t target = self->capacity * 2;
  if (target < 8) target = 8;
  if (target < required) target = required;
  array__reserve(self, element_size, target);
}
/*
 * Replace the `old_count` elements starting at `index` with `new_count`
 * elements copied from `elements`. The removed range must lie inside the
 * array (asserted). Elements after the removed range are shifted so that
 * they follow the inserted ones.
 */
static inline void array__splice(VoidArray *self, size_t element_size,
                                 uint32_t index, uint32_t old_count,
                                 uint32_t new_count, const void *elements) {
  uint32_t resulting_size = self->size + new_count - old_count;
  uint32_t removed_end = index + old_count;
  uint32_t inserted_end = index + new_count;
  assert(removed_end <= self->size);

  array__reserve(self, element_size, resulting_size);

  char *bytes = (char *)self->contents;

  /* Move the surviving tail to its final position first. */
  if (removed_end < self->size) {
    memmove(bytes + inserted_end * element_size,
            bytes + removed_end * element_size,
            (self->size - removed_end) * element_size);
  }

  /* Then drop the new elements into the gap. */
  if (new_count != 0) {
    memcpy(bytes + index * element_size, elements, new_count * element_size);
  }

  self->size += new_count - old_count;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_ARRAY_H_

View File

@ -0,0 +1,42 @@
#ifndef TREE_SITTER_ATOMIC_H_
#define TREE_SITTER_ATOMIC_H_
#include <stdint.h>
#ifdef _WIN32
#include <windows.h>
/* Windows: read the value through the volatile pointer. */
static inline size_t atomic_load(const volatile size_t *p) {
  size_t current = *p;
  return current;
}

/* Windows: increment via InterlockedIncrement and return its result. */
static inline uint32_t atomic_inc(volatile uint32_t *p) {
  long volatile *counter = (long volatile *)p;
  return InterlockedIncrement(counter);
}

/* Windows: decrement via InterlockedDecrement and return its result. */
static inline uint32_t atomic_dec(volatile uint32_t *p) {
  long volatile *counter = (long volatile *)p;
  return InterlockedDecrement(counter);
}
#else
/*
 * Read `*p` with the compiler's atomic builtins: a relaxed
 * `__atomic_load_n` when available, otherwise a fetch-and-add of zero.
 */
static inline size_t atomic_load(const volatile size_t *p) {
  size_t current;
#ifdef __ATOMIC_RELAXED
  current = __atomic_load_n(p, __ATOMIC_RELAXED);
#else
  current = __sync_fetch_and_add((volatile size_t *)p, 0);
#endif
  return current;
}

/* Add one to `*p` and return the resulting value. */
static inline uint32_t atomic_inc(volatile uint32_t *p) {
  uint32_t updated = __sync_add_and_fetch(p, 1u);
  return updated;
}

/* Subtract one from `*p` and return the resulting value. */
static inline uint32_t atomic_dec(volatile uint32_t *p) {
  uint32_t updated = __sync_sub_and_fetch(p, 1u);
  return updated;
}
#endif
#endif // TREE_SITTER_ATOMIC_H_

View File

@ -0,0 +1,29 @@
#ifndef TREE_SITTER_BITS_H_
#define TREE_SITTER_BITS_H_
#include <stdint.h>
/*
 * Single-bit mask counted from the most significant end: index 0 yields the
 * top bit of a 32-bit word and index 31 the lowest bit.
 */
static inline uint32_t bitmask_for_index(uint16_t id) {
  int shift = 31 - id;
  return 1u << shift;
}
#if defined _WIN32 && !defined __GNUC__
#include <intrin.h>
/*
 * Number of zero bits above the highest set bit of `x`; 32 when `x` is 0.
 * MSVC variant built on the `_BitScanReverse` intrinsic.
 */
static inline uint32_t count_leading_zeros(uint32_t x) {
  if (x == 0) return 32;
  /* _BitScanReverse writes the bit index through an `unsigned long *`, so
   * the output variable must have exactly that type rather than uint32_t. */
  unsigned long highest_set_bit;
  _BitScanReverse(&highest_set_bit, x);
  return 31 - (uint32_t)highest_set_bit;
}
#else
/*
 * Number of zero bits above the highest set bit of `x`; 32 when `x` is 0.
 * Zero is special-cased before the builtin is called.
 */
static inline uint32_t count_leading_zeros(uint32_t x) {
  return x != 0 ? (uint32_t)__builtin_clz(x) : 32;
}
#endif
#endif // TREE_SITTER_BITS_H_

View File

@ -0,0 +1,141 @@
#ifndef TREE_SITTER_CLOCK_H_
#define TREE_SITTER_CLOCK_H_
#include <stdint.h>
typedef uint64_t TSDuration;
#ifdef _WIN32
// Windows:
// * Represent a time as a performance counter value.
// * Represent a duration as a number of performance counter ticks.
#include <windows.h>
typedef uint64_t TSClock;
/* Convert microseconds into performance counter ticks. */
static inline TSDuration duration_from_micros(uint64_t micros) {
  LARGE_INTEGER ticks_per_second;
  QueryPerformanceFrequency(&ticks_per_second);
  uint64_t frequency = (uint64_t)ticks_per_second.QuadPart;
  return micros * frequency / 1000000;
}

/* Convert performance counter ticks into microseconds. */
static inline uint64_t duration_to_micros(TSDuration self) {
  LARGE_INTEGER ticks_per_second;
  QueryPerformanceFrequency(&ticks_per_second);
  uint64_t frequency = (uint64_t)ticks_per_second.QuadPart;
  return self * 1000000 / frequency;
}

/* The "no clock" value is a counter of zero. */
static inline TSClock clock_null(void) {
  TSClock none = 0;
  return none;
}

/* Current performance counter reading. */
static inline TSClock clock_now(void) {
  LARGE_INTEGER counter;
  QueryPerformanceCounter(&counter);
  return (uint64_t)counter.QuadPart;
}

/* The point in time `duration` ticks after `base`. */
static inline TSClock clock_after(TSClock base, TSDuration duration) {
  TSClock deadline = base + duration;
  return deadline;
}

/* True when `self` equals the value produced by `clock_null`. */
static inline bool clock_is_null(TSClock self) {
  return self == 0;
}

/* True when `self` is strictly later than `other`. */
static inline bool clock_is_gt(TSClock self, TSClock other) {
  return other < self;
}
#elif defined(CLOCK_MONOTONIC) && !defined(__APPLE__)
// POSIX with monotonic clock support (Linux)
// * Represent a time as a monotonic (seconds, nanoseconds) pair.
// * Represent a duration as a number of microseconds.
//
// On these platforms, parse timeouts will correspond accurately to
// real time, regardless of what other processes are running.
#include <time.h>
typedef struct timespec TSClock;
/* Durations are stored as microseconds here, so this is the identity. */
static inline TSDuration duration_from_micros(uint64_t micros) {
  TSDuration duration = micros;
  return duration;
}

/* Durations are stored as microseconds here, so this is the identity. */
static inline uint64_t duration_to_micros(TSDuration self) {
  uint64_t micros = self;
  return micros;
}

/* Current reading of the monotonic clock. */
static inline TSClock clock_now(void) {
  TSClock now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  return now;
}

/* The "no clock" value: zero seconds and zero nanoseconds. */
static inline TSClock clock_null(void) {
  TSClock none = {0, 0};
  return none;
}
/*
 * The point in time `duration` microseconds after `base`.
 *
 * The result is normalized so that `tv_nsec` stays below one second.
 * `clock_is_gt` compares `tv_sec` first and `tv_nsec` second, which only
 * orders times correctly when `tv_nsec` has not overflowed into the next
 * second.
 */
static inline TSClock clock_after(TSClock base, TSDuration duration) {
  TSClock result = base;
  result.tv_sec += duration / 1000000;
  result.tv_nsec += (duration % 1000000) * 1000;
  /* The added part is below one second, so a single carry is enough as
   * long as `base.tv_nsec` was itself in range. */
  if (result.tv_nsec >= 1000000000) {
    result.tv_nsec -= 1000000000;
    result.tv_sec += 1;
  }
  return result;
}
/* A clock counts as null when its seconds field is zero. */
static inline bool clock_is_null(TSClock self) {
  return self.tv_sec == 0;
}

/* True when `self` is strictly later than `other`. */
static inline bool clock_is_gt(TSClock self, TSClock other) {
  /* Seconds decide unless they are equal; then nanoseconds break the tie. */
  if (self.tv_sec != other.tv_sec) {
    return self.tv_sec > other.tv_sec;
  }
  return self.tv_nsec > other.tv_nsec;
}
#else
// macOS or POSIX without monotonic clock support
// * Represent a time as a process clock value.
// * Represent a duration as a number of process clock ticks.
//
// On these platforms, parse timeouts may be affected by other processes,
// which is not ideal, but is better than using a non-monotonic time API
// like `gettimeofday`.
#include <time.h>
typedef uint64_t TSClock;
// Convert microseconds into process clock ticks (CLOCKS_PER_SEC per second).
static inline TSDuration duration_from_micros(uint64_t micros) {
  const uint64_t ticks_per_second = (uint64_t)CLOCKS_PER_SEC;
  return micros * ticks_per_second / 1000000;
}
// Convert process clock ticks (CLOCKS_PER_SEC per second) into microseconds.
static inline uint64_t duration_to_micros(TSDuration self) {
  const uint64_t ticks_per_second = (uint64_t)CLOCKS_PER_SEC;
  return self * 1000000 / ticks_per_second;
}
// The "no clock" sentinel is tick zero.
static inline TSClock clock_null(void) {
  const TSClock unset = 0;
  return unset;
}
// Read the process clock via clock() and widen it to 64 bits.
static inline TSClock clock_now(void) {
  uint64_t ticks = (uint64_t)clock();
  return ticks;
}
// Return the clock value that lies `duration` ticks after `base`.
static inline TSClock clock_after(TSClock base, TSDuration duration) {
  TSClock deadline = base + duration;
  return deadline;
}
// True when `self` is the zero sentinel produced by clock_null().
static inline bool clock_is_null(TSClock self) {
  return self == 0;
}
// True when `self` is strictly later than `other`.
static inline bool clock_is_gt(TSClock self, TSClock other) {
  return other < self;
}
#endif
#endif // TREE_SITTER_CLOCK_H_

View File

@ -0,0 +1,11 @@
#ifndef TREE_SITTER_ERROR_COSTS_H_
#define TREE_SITTER_ERROR_COSTS_H_
// Parse-state id that denotes the error state; compared against subtree
// parse states elsewhere in the library.
#define ERROR_STATE 0
// NOTE(review): the constants below look like relative penalty weights used
// when scoring error recovery; their use sites are not part of this header —
// confirm against the parser before relying on that description.
#define ERROR_COST_PER_RECOVERY 500
#define ERROR_COST_PER_MISSING_TREE 110
#define ERROR_COST_PER_SKIPPED_TREE 100
#define ERROR_COST_PER_SKIPPED_LINE 30
#define ERROR_COST_PER_SKIPPED_CHAR 1
#endif

View File

@ -0,0 +1,482 @@
#include "./get_changed_ranges.h"
#include "./subtree.h"
#include "./language.h"
#include "./error_costs.h"
#include "./tree_cursor.h"
#include <assert.h>
// #define DEBUG_GET_CHANGED_RANGES
// Record the span [start, end) in `self`.
//
// If the array already has a last range and the new span begins at or before
// that range's end byte, the last range is extended to `end` instead of
// appending. Otherwise the span is appended, unless it is empty.
static void ts_range_array_add(TSRangeArray *self, Length start, Length end) {
  TSRange *previous = self->size > 0 ? array_back(self) : NULL;
  if (previous && start.bytes <= previous->end_byte) {
    previous->end_byte = end.bytes;
    previous->end_point = end.extent;
    return;
  }

  if (end.bytes > start.bytes) {
    TSRange appended = { start.extent, end.extent, start.bytes, end.bytes };
    array_push(self, appended);
  }
}
// Report whether any range in `self`, scanning from `start_index`, overlaps
// the byte span [start_byte, end_byte).
//
// Ranges ending at or before `start_byte` are skipped; the first range that
// ends after `start_byte` decides the answer.
bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index,
                               uint32_t start_byte, uint32_t end_byte) {
  for (unsigned i = start_index; i < self->size; i++) {
    const TSRange *candidate = &self->contents[i];
    if (candidate->end_byte <= start_byte) continue;
    return candidate->start_byte < end_byte;
  }
  return false;
}
// Compare two lists of ranges and append to `differences` every span that is
// covered by exactly one of the two lists.
//
// The function sweeps through the start/end boundaries of both lists in byte
// order, tracking whether the current position is inside an old range and/or
// inside a new range.
// NOTE(review): the sweep appears to assume that each list is ordered by byte
// offset and non-overlapping — confirm against callers.
void ts_range_array_get_changed_ranges(
const TSRange *old_ranges, unsigned old_range_count,
const TSRange *new_ranges, unsigned new_range_count,
TSRangeArray *differences
) {
unsigned new_index = 0;
unsigned old_index = 0;
Length current_position = length_zero();
// Whether `current_position` lies inside a range of the respective list.
bool in_old_range = false;
bool in_new_range = false;
while (old_index < old_range_count || new_index < new_range_count) {
// These pointers may be one-past-the-end; they are only dereferenced below
// when the matching `in_*_range` flag is set or the index is in bounds.
const TSRange *old_range = &old_ranges[old_index];
const TSRange *new_range = &new_ranges[new_index];
// Next boundary in the old list: the end of the range we are in, the start
// of the next range, or LENGTH_MAX when the list is exhausted.
Length next_old_position;
if (in_old_range) {
next_old_position = (Length) {old_range->end_byte, old_range->end_point};
} else if (old_index < old_range_count) {
next_old_position = (Length) {old_range->start_byte, old_range->start_point};
} else {
next_old_position = LENGTH_MAX;
}
// Next boundary in the new list, computed the same way.
Length next_new_position;
if (in_new_range) {
next_new_position = (Length) {new_range->end_byte, new_range->end_point};
} else if (new_index < new_range_count) {
next_new_position = (Length) {new_range->start_byte, new_range->start_point};
} else {
next_new_position = LENGTH_MAX;
}
// Advance to whichever boundary comes first. The span just traversed is a
// difference when exactly one of the lists covered it.
if (next_old_position.bytes < next_new_position.bytes) {
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_old_position);
}
if (in_old_range) old_index++;
current_position = next_old_position;
in_old_range = !in_old_range;
} else if (next_new_position.bytes < next_old_position.bytes) {
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_new_position);
}
if (in_new_range) new_index++;
current_position = next_new_position;
in_new_range = !in_new_range;
} else {
// Both lists have a boundary at the same byte: cross both at once.
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_new_position);
}
if (in_old_range) old_index++;
if (in_new_range) new_index++;
in_old_range = !in_old_range;
in_new_range = !in_new_range;
current_position = next_new_position;
}
}
}
// Traversal state used while comparing two syntax trees.
typedef struct {
// Stack of tree cursor entries; the last entry is the current node.
TreeCursor cursor;
// Language used to look up alias sequences for parent productions.
const TSLanguage *language;
// Depth counter that is incremented when entering a visible node and
// decremented when leaving one.
unsigned visible_depth;
// True while the iterator is positioned in the padding that precedes the
// current node rather than in the node's own content.
bool in_padding;
} Iterator;
// Build an iterator positioned on `tree`.
//
// The caller's cursor stack is emptied and seeded with a single entry for the
// root at position zero; the iterator then takes a copy of that cursor.
static Iterator iterator_new(TreeCursor *cursor, const Subtree *tree, const TSLanguage *language) {
  TreeCursorEntry root_entry = {
    .subtree = tree,
    .position = length_zero(),
    .child_index = 0,
    .structural_child_index = 0,
  };
  array_clear(&cursor->stack);
  array_push(&cursor->stack, root_entry);

  Iterator iterator = {
    .cursor = *cursor,
    .language = language,
    .visible_depth = 1,
    .in_padding = false,
  };
  return iterator;
}
// The traversal is finished once the cursor stack has been emptied.
static bool iterator_done(Iterator *self) {
  return !self->cursor.stack.size;
}
// Position at which the iterator's current region begins: the entry's own
// position while in padding, otherwise the position just past the padding.
static Length iterator_start_position(Iterator *self) {
  TreeCursorEntry top = *array_back(&self->cursor.stack);
  Length start = top.position;
  if (!self->in_padding) {
    start = length_add(start, ts_subtree_padding(*top.subtree));
  }
  return start;
}
// Position at which the iterator's current region ends: the end of the
// padding while in padding, otherwise the end of the node's content.
static Length iterator_end_position(Iterator *self) {
  TreeCursorEntry top = *array_back(&self->cursor.stack);
  Length end = length_add(top.position, ts_subtree_padding(*top.subtree));
  if (!self->in_padding) {
    end = length_add(end, ts_subtree_size(*top.subtree));
  }
  return end;
}
// Decide whether the current node counts as visible: either the subtree is
// visible itself, or its parent's production aliases it (a non-zero entry in
// the parent's alias sequence at this node's structural child index).
static bool iterator_tree_is_visible(const Iterator *self) {
  TreeCursorEntry top = *array_back(&self->cursor.stack);
  if (ts_subtree_visible(*top.subtree)) return true;

  uint32_t depth = self->cursor.stack.size;
  if (depth <= 1) return false;

  Subtree parent = *self->cursor.stack.contents[depth - 2].subtree;
  const TSSymbol *aliases = ts_language_alias_sequence(
    self->language,
    parent.ptr->production_id
  );
  if (!aliases) return false;
  return aliases[top.structural_child_index] != 0;
}
// Find the innermost visible (or aliased) node on the cursor stack and report
// it through the out-parameters.
//
// The search starts at the top of the stack, or one entry below it when the
// iterator is in padding, and walks toward the root. On success `*tree`,
// `*alias_symbol` and `*start_byte` describe the node found. If the iterator
// is in padding at the root, or no visible node is found, `*tree` and
// `*start_byte` are left untouched.
static void iterator_get_visible_state(const Iterator *self, Subtree *tree,
TSSymbol *alias_symbol, uint32_t *start_byte) {
uint32_t i = self->cursor.stack.size - 1;
if (self->in_padding) {
if (i == 0) return;
i--;
}
// `i` is unsigned, so `i + 1 > 0` is the loop guard that lets the loop
// visit index 0 and then stop when `i` wraps around.
for (; i + 1 > 0; i--) {
TreeCursorEntry entry = self->cursor.stack.contents[i];
if (i > 0) {
// Look up whether the parent's production assigns an alias to this child.
const Subtree *parent = self->cursor.stack.contents[i - 1].subtree;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->language,
parent->ptr->production_id
);
if (alias_sequence) {
*alias_symbol = alias_sequence[entry.structural_child_index];
}
}
if (ts_subtree_visible(*entry.subtree) || *alias_symbol) {
*tree = *entry.subtree;
*start_byte = entry.position.bytes;
break;
}
}
}
// Pop the current node off the cursor stack, moving to its parent.
//
// The visible depth is reduced when the node being left is visible and the
// iterator is not in padding; the padding flag is cleared when the node is
// not its parent's first child.
static void iterator_ascend(Iterator *self) {
  if (iterator_done(self)) return;

  bool leaving_visible_node = iterator_tree_is_visible(self) && !self->in_padding;
  if (leaving_visible_node) {
    self->visible_depth--;
  }

  bool is_first_child = array_back(&self->cursor.stack)->child_index == 0;
  if (!is_first_child) {
    self->in_padding = false;
  }

  self->cursor.stack.size--;
}
// Descend from the current node to the first visible descendant whose end
// byte lies beyond `goal_position`.
//
// Returns true when a visible node was reached. In that case the iterator is
// either in that node's padding (if the node's content starts after
// `goal_position`) or inside the node, with `visible_depth` incremented.
// Returns false when the iterator is in padding, or when no visible
// descendant was found; invisible nodes pushed along the way stay on the
// stack.
static bool iterator_descend(Iterator *self, uint32_t goal_position) {
if (self->in_padding) return false;
bool did_descend;
do {
did_descend = false;
TreeCursorEntry entry = *array_back(&self->cursor.stack);
Length position = entry.position;
// Index of the child among non-extra children; used for alias lookups.
uint32_t structural_child_index = 0;
for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) {
const Subtree *child = &entry.subtree->ptr->children[i];
Length child_left = length_add(position, ts_subtree_padding(*child));
Length child_right = length_add(child_left, ts_subtree_size(*child));
// The first child that ends after the goal is the one to enter.
if (child_right.bytes > goal_position) {
array_push(&self->cursor.stack, ((TreeCursorEntry){
.subtree = child,
.position = position,
.child_index = i,
.structural_child_index = structural_child_index,
}));
if (iterator_tree_is_visible(self)) {
if (child_left.bytes > goal_position) {
self->in_padding = true;
} else {
self->visible_depth++;
}
return true;
}
// The child is invisible: keep descending through it.
did_descend = true;
break;
}
position = child_right;
if (!ts_subtree_extra(*child)) structural_child_index++;
}
} while (did_descend);
return false;
}
// Move the iterator forward by one step.
//
// When in padding, the step simply leaves the padding and enters the current
// node (descending further if that node is not visible). Otherwise the
// current node is popped and the iterator moves to its next sibling, or, if
// there is none, to the next sibling of the closest ancestor that has one.
// The iterator becomes "done" when the stack empties.
static void iterator_advance(Iterator *self) {
if (self->in_padding) {
self->in_padding = false;
if (iterator_tree_is_visible(self)) {
self->visible_depth++;
} else {
iterator_descend(self, 0);
}
return;
}
for (;;) {
// Leave the current node; visibility must be tested before popping since
// the test inspects the top of the stack.
if (iterator_tree_is_visible(self)) self->visible_depth--;
TreeCursorEntry entry = array_pop(&self->cursor.stack);
if (iterator_done(self)) return;
const Subtree *parent = array_back(&self->cursor.stack)->subtree;
uint32_t child_index = entry.child_index + 1;
if (ts_subtree_child_count(*parent) > child_index) {
// The next sibling starts where the popped node (padding + size) ended.
Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree));
uint32_t structural_child_index = entry.structural_child_index;
if (!ts_subtree_extra(*entry.subtree)) structural_child_index++;
const Subtree *next_child = &parent->ptr->children[child_index];
array_push(&self->cursor.stack, ((TreeCursorEntry){
.subtree = next_child,
.position = position,
.child_index = child_index,
.structural_child_index = structural_child_index,
}));
if (iterator_tree_is_visible(self)) {
// A visible sibling with padding is entered via its padding first.
if (ts_subtree_padding(*next_child).bytes > 0) {
self->in_padding = true;
} else {
self->visible_depth++;
}
} else {
iterator_descend(self, 0);
}
break;
}
}
}
// Result of comparing the current nodes of the old and new iterators.
typedef enum {
// The nodes are different; the spanned text is reported as changed.
IteratorDiffers,
// The nodes have the same symbol but may differ internally; the caller
// descends into them.
IteratorMayDiffer,
// The nodes are considered identical; the caller skips past them.
IteratorMatches,
} IteratorComparison;
// Classify how the current visible nodes of the two iterators relate.
//
// * Neither iterator has a visible node: IteratorMatches.
// * Only one has a visible node, or the alias/symbol differs: IteratorDiffers.
// * Same alias and symbol: IteratorMatches when the nodes start at the same
//   byte, the old node has no pending changes and is not an error node, both
//   have the same size and a real parse state, and both agree on being in
//   the error state or not; otherwise IteratorMayDiffer.
static IteratorComparison iterator_compare(const Iterator *old_iter, const Iterator *new_iter) {
  Subtree old_tree = NULL_SUBTREE;
  Subtree new_tree = NULL_SUBTREE;
  uint32_t old_start = 0, new_start = 0;
  TSSymbol old_alias_symbol = 0, new_alias_symbol = 0;
  iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start);
  iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start);

  if (!old_tree.ptr || !new_tree.ptr) {
    return (!old_tree.ptr && !new_tree.ptr) ? IteratorMatches : IteratorDiffers;
  }

  if (old_alias_symbol != new_alias_symbol) return IteratorDiffers;
  if (ts_subtree_symbol(old_tree) != ts_subtree_symbol(new_tree)) return IteratorDiffers;

  bool certainly_unchanged =
    old_start == new_start &&
    !ts_subtree_has_changes(old_tree) &&
    ts_subtree_symbol(old_tree) != ts_builtin_sym_error &&
    ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes &&
    ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE &&
    ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE &&
    (ts_subtree_parse_state(old_tree) == ERROR_STATE) ==
    (ts_subtree_parse_state(new_tree) == ERROR_STATE);

  return certainly_unchanged ? IteratorMatches : IteratorMayDiffer;
}
#ifdef DEBUG_GET_CHANGED_RANGES
// Debug helper (compiled only with DEBUG_GET_CHANGED_RANGES): print the
// current node's symbol name, a "(p)" marker when in padding, the visible
// depth, and the 1-based start/end rows with their columns.
static inline void iterator_print_state(Iterator *self) {
TreeCursorEntry entry = *array_back(&self->cursor.stack);
TSPoint start = iterator_start_position(self).extent;
TSPoint end = iterator_end_position(self).extent;
const char *name = ts_language_symbol_name(self->language, ts_subtree_symbol(*entry.subtree));
printf(
"(%-25s %s\t depth:%u [%u, %u] - [%u, %u])",
name, self->in_padding ? "(p)" : " ",
self->visible_depth,
start.row + 1, start.column,
end.row + 1, end.column
);
}
#endif
// Walk the old and new syntax trees in lockstep and collect the ranges of
// text whose syntactic structure differs between them.
//
// Parameters:
//   old_tree, new_tree          - roots of the two trees being compared.
//   cursor1, cursor2            - scratch cursors; their stacks are reset on
//                                 entry and the final iterator cursors are
//                                 written back to them on exit.
//   language                    - used for alias and symbol lookups.
//   included_range_differences  - spans whose inclusion in the parse changed;
//                                 apparently-identical subtrees intersecting
//                                 these are still descended into.
//   ranges                      - out: receives the array of changed ranges.
//
// Returns the number of entries stored in `*ranges`.
// NOTE(review): `*ranges` points at the contents of a locally built array;
// presumably the caller takes ownership and frees it — confirm.
unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges) {
TSRangeArray results = array_new();
Iterator old_iter = iterator_new(cursor1, old_tree, language);
Iterator new_iter = iterator_new(cursor2, new_tree, language);
unsigned included_range_difference_index = 0;
// If the two trees start at different offsets (different leading padding),
// the gap between the two start positions is itself a change.
Length position = iterator_start_position(&old_iter);
Length next_position = iterator_start_position(&new_iter);
if (position.bytes < next_position.bytes) {
ts_range_array_add(&results, position, next_position);
position = next_position;
} else if (position.bytes > next_position.bytes) {
ts_range_array_add(&results, next_position, position);
next_position = position;
}
do {
#ifdef DEBUG_GET_CHANGED_RANGES
printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column);
iterator_print_state(&old_iter);
printf("\tvs\t");
iterator_print_state(&new_iter);
puts("");
#endif
// Compare the old and new subtrees.
IteratorComparison comparison = iterator_compare(&old_iter, &new_iter);
// Even if the two subtrees appear to be identical, they could differ
// internally if they contain a range of text that was previously
// excluded from the parse, and is now included, or vice-versa.
if (comparison == IteratorMatches && ts_range_array_intersects(
included_range_differences,
included_range_difference_index,
position.bytes,
iterator_end_position(&old_iter).bytes
)) {
comparison = IteratorMayDiffer;
}
bool is_changed = false;
switch (comparison) {
// If the subtrees are definitely identical, move to the end
// of both subtrees.
case IteratorMatches:
next_position = iterator_end_position(&old_iter);
break;
// If the subtrees might differ internally, descend into both
// subtrees, finding the first child that spans the current position.
case IteratorMayDiffer:
if (iterator_descend(&old_iter, position.bytes)) {
// Only the old tree could be descended: structure differs here.
if (!iterator_descend(&new_iter, position.bytes)) {
is_changed = true;
next_position = iterator_end_position(&old_iter);
}
} else if (iterator_descend(&new_iter, position.bytes)) {
// Only the new tree could be descended: structure differs here.
is_changed = true;
next_position = iterator_end_position(&new_iter);
} else {
// Neither tree can be descended further: skip to the nearer end.
next_position = length_min(
iterator_end_position(&old_iter),
iterator_end_position(&new_iter)
);
}
break;
// If the subtrees are different, record a change and then move
// to the end of both subtrees.
case IteratorDiffers:
is_changed = true;
next_position = length_min(
iterator_end_position(&old_iter),
iterator_end_position(&new_iter)
);
break;
}
// Ensure that both iterators are caught up to the current position.
while (
!iterator_done(&old_iter) &&
iterator_end_position(&old_iter).bytes <= next_position.bytes
) iterator_advance(&old_iter);
while (
!iterator_done(&new_iter) &&
iterator_end_position(&new_iter).bytes <= next_position.bytes
) iterator_advance(&new_iter);
// Ensure that both iterators are at the same depth in the tree.
while (old_iter.visible_depth > new_iter.visible_depth) {
iterator_ascend(&old_iter);
}
while (new_iter.visible_depth > old_iter.visible_depth) {
iterator_ascend(&new_iter);
}
if (is_changed) {
#ifdef DEBUG_GET_CHANGED_RANGES
printf(
" change: [[%u, %u] - [%u, %u]]\n",
position.extent.row + 1, position.extent.column,
next_position.extent.row + 1, next_position.extent.column
);
#endif
ts_range_array_add(&results, position, next_position);
}
position = next_position;
// Keep track of the current position in the included range differences
// array in order to avoid scanning the entire array on each iteration.
while (included_range_difference_index < included_range_differences->size) {
const TSRange *range = &included_range_differences->contents[
included_range_difference_index
];
if (range->end_byte <= position.bytes) {
included_range_difference_index++;
} else {
break;
}
}
} while (!iterator_done(&old_iter) && !iterator_done(&new_iter));
// If one tree is longer than the other, the trailing extra text is a change.
Length old_size = ts_subtree_total_size(*old_tree);
Length new_size = ts_subtree_total_size(*new_tree);
if (old_size.bytes < new_size.bytes) {
ts_range_array_add(&results, old_size, new_size);
} else if (new_size.bytes < old_size.bytes) {
ts_range_array_add(&results, new_size, old_size);
}
// Hand the (possibly reallocated) cursor stacks back to the caller.
*cursor1 = old_iter.cursor;
*cursor2 = new_iter.cursor;
*ranges = results.contents;
return results.size;
}

View File

@ -0,0 +1,36 @@
#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_
#define TREE_SITTER_GET_CHANGED_RANGES_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./tree_cursor.h"
#include "./subtree.h"
// Growable array of TSRange values.
typedef Array(TSRange) TSRangeArray;
// Append to `differences` the spans that are covered by exactly one of the
// two given range lists.
void ts_range_array_get_changed_ranges(
const TSRange *old_ranges, unsigned old_range_count,
const TSRange *new_ranges, unsigned new_range_count,
TSRangeArray *differences
);
// Report whether any range of `self`, scanning from `start_index`, overlaps
// the byte span [start_byte, end_byte).
bool ts_range_array_intersects(
const TSRangeArray *self, unsigned start_index,
uint32_t start_byte, uint32_t end_byte
);
// Compare two syntax trees and store the array of differing ranges in
// `*ranges`; returns the number of ranges stored.
unsigned ts_subtree_get_changed_ranges(
const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges
);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_GET_CHANGED_RANGES_H_

View File

@ -0,0 +1,147 @@
#include "./language.h"
#include "./subtree.h"
#include "./error_costs.h"
#include <string.h>
// Total number of symbols: the language's own symbols plus its aliases.
uint32_t ts_language_symbol_count(const TSLanguage *self) {
  uint32_t total = self->symbol_count;
  total += self->alias_count;
  return total;
}
// ABI version number stored in the language.
uint32_t ts_language_version(const TSLanguage *self) {
  uint32_t version = self->version;
  return version;
}
// Number of fields in the language, or 0 for languages generated before
// field support existed (the `field_count` member is only read when the
// version is new enough).
uint32_t ts_language_field_count(const TSLanguage *self) {
  bool has_fields = self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS;
  return has_fields ? self->field_count : 0;
}
// Fill `result` with the parse actions for `symbol` in `state`.
//
// The built-in error symbols have no table entry: they yield zero actions
// and are never reusable. Any other symbol must be a token; its entry in
// `parse_actions` is a header (count, reusable) immediately followed by the
// actions themselves.
void ts_language_table_entry(
  const TSLanguage *self,
  TSStateId state,
  TSSymbol symbol,
  TableEntry *result
) {
  bool is_error_symbol =
    symbol == ts_builtin_sym_error ||
    symbol == ts_builtin_sym_error_repeat;
  if (is_error_symbol) {
    result->action_count = 0;
    result->is_reusable = false;
    result->actions = NULL;
    return;
  }

  assert(symbol < self->token_count);
  uint32_t action_index = ts_language_lookup(self, state, symbol);
  const TSParseActionEntry *header = &self->parse_actions[action_index];
  result->action_count = header->count;
  result->is_reusable = header->reusable;
  result->actions = (const TSParseAction *)(header + 1);
}
// Metadata (visible / named flags) for `symbol`.
//
// The built-in error symbol is visible and named; the built-in error-repeat
// symbol is neither. All other symbols are looked up in the language table.
TSSymbolMetadata ts_language_symbol_metadata(
  const TSLanguage *self,
  TSSymbol symbol
) {
  if (symbol == ts_builtin_sym_error) {
    TSSymbolMetadata error_metadata = {.visible = true, .named = true};
    return error_metadata;
  }
  if (symbol == ts_builtin_sym_error_repeat) {
    TSSymbolMetadata repeat_metadata = {.visible = false, .named = false};
    return repeat_metadata;
  }
  return self->symbol_metadata[symbol];
}
// Map an internal symbol to its public equivalent.
//
// The error symbol maps to itself. Languages new enough to carry a public
// symbol map are translated through it; older ones return the symbol as-is.
TSSymbol ts_language_public_symbol(
  const TSLanguage *self,
  TSSymbol symbol
) {
  if (symbol == ts_builtin_sym_error) return symbol;

  bool has_symbol_map =
    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING;
  return has_symbol_map ? self->public_symbol_map[symbol] : symbol;
}
// Human-readable name of `symbol`: "ERROR" and "_ERROR" for the two built-in
// error symbols, otherwise the name stored in the language.
const char *ts_language_symbol_name(
  const TSLanguage *self,
  TSSymbol symbol
) {
  if (symbol == ts_builtin_sym_error) return "ERROR";
  if (symbol == ts_builtin_sym_error_repeat) return "_ERROR";
  return self->symbol_names[symbol];
}
// Look up a symbol by name.
//
// Parameters:
//   string, length - the name to look for; `string` need not be
//                    NUL-terminated, only `length` bytes are examined.
//   is_named       - whether to match named or anonymous symbols.
//
// Returns the built-in error symbol for the exact name "ERROR", the (public)
// symbol id of the first visible symbol whose name and named-ness match, or
// 0 when nothing matches.
TSSymbol ts_language_symbol_for_name(
  const TSLanguage *self,
  const char *string,
  uint32_t length,
  bool is_named
) {
  // The length must match too: strncmp alone would accept any prefix of
  // "ERROR" (including the empty string) as the error symbol.
  if (length == strlen("ERROR") && !strncmp(string, "ERROR", length)) {
    return ts_builtin_sym_error;
  }

  uint32_t count = ts_language_symbol_count(self);
  for (TSSymbol i = 0; i < count; i++) {
    TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i);
    if (!metadata.visible || metadata.named != is_named) continue;

    // Match the first `length` bytes and require the candidate to end there.
    const char *symbol_name = self->symbol_names[i];
    if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
      if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
        return self->public_symbol_map[i];
      } else {
        return i;
      }
    }
  }
  return 0;
}
// Classify `symbol` from its metadata: named symbols are regular, unnamed
// but visible symbols are anonymous, and everything else is auxiliary.
TSSymbolType ts_language_symbol_type(
  const TSLanguage *self,
  TSSymbol symbol
) {
  TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol);
  if (metadata.named) return TSSymbolTypeRegular;
  return metadata.visible ? TSSymbolTypeAnonymous : TSSymbolTypeAuxiliary;
}
// Name of the field with the given id, or NULL when the language has no
// fields or `id` is out of range.
//
// ts_language_field_id_for_name() scans `field_names` at indices 1..count,
// so any id above `count` would index past the table and is rejected here.
const char *ts_language_field_name_for_id(
  const TSLanguage *self,
  TSFieldId id
) {
  uint32_t count = ts_language_field_count(self);
  if (count && id <= count) {
    return self->field_names[id];
  } else {
    return NULL;
  }
}
// Look up a field id by name.
//
// `name` need not be NUL-terminated; only `name_length` bytes are examined.
// Returns the 1-based field id, or 0 when there is no such field.
//
// The scan stops early as soon as `name` compares lower than a table entry.
// NOTE(review): as in the original early exit, this relies on `field_names`
// being sorted in ascending byte order — confirm against the generator.
TSFieldId ts_language_field_id_for_name(
  const TSLanguage *self,
  const char *name,
  uint32_t name_length
) {
  uint32_t count = ts_language_field_count(self);
  for (TSSymbol i = 1; i < count + 1; i++) {
    // strncmp only guarantees the sign of its result, so test `< 0` rather
    // than comparing against the literal -1.
    int comparison = strncmp(name, self->field_names[i], name_length);
    if (comparison == 0) {
      // Prefix matches; it is a full match only if the entry ends here.
      if (self->field_names[i][name_length] == 0) return i;
    } else if (comparison < 0) {
      return 0;
    }
  }
  return 0;
}

View File

@ -0,0 +1,141 @@
#ifndef TREE_SITTER_LANGUAGE_H_
#define TREE_SITTER_LANGUAGE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./subtree.h"
#include "tree_sitter/parser.h"
// Second built-in error symbol, numbered immediately below
// ts_builtin_sym_error.
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
// Minimum language versions at which optional TSLanguage members may be
// read: field tables, the public symbol map, and the small parse table.
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
// Result of a parse-table lookup for one (state, symbol) pair.
typedef struct {
// First of `action_count` consecutive actions, or NULL when there are none.
const TSParseAction *actions;
uint32_t action_count;
// Copied from the table entry's `reusable` flag.
bool is_reusable;
} TableEntry;
// Fill a TableEntry with the parse actions for a (state, symbol) pair.
void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *);
// Return the visible/named metadata for a symbol.
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
// Map an internal symbol id to its public symbol id.
TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol);
// True when `symbol` lies in the range 1..external_token_count.
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
  if (symbol == 0) return false;
  return symbol < self->external_token_count + 1;
}
// Return the parse actions for `symbol` in `state`, storing how many there
// are in `*count`.
static inline const TSParseAction *ts_language_actions(const TSLanguage *self,
                                                       TSStateId state,
                                                       TSSymbol symbol,
                                                       uint32_t *count) {
  TableEntry lookup;
  ts_language_table_entry(self, state, symbol, &lookup);
  const TSParseAction *actions = lookup.actions;
  *count = lookup.action_count;
  return actions;
}
// True when the parse table holds at least one action for `symbol` in `state`.
static inline bool ts_language_has_actions(const TSLanguage *self,
                                           TSStateId state,
                                           TSSymbol symbol) {
  TableEntry lookup;
  ts_language_table_entry(self, state, symbol, &lookup);
  return lookup.action_count != 0;
}
// True when the first parse action for `symbol` in `state` is a reduction.
static inline bool ts_language_has_reduce_action(const TSLanguage *self,
                                                 TSStateId state,
                                                 TSSymbol symbol) {
  TableEntry lookup;
  ts_language_table_entry(self, state, symbol, &lookup);
  if (lookup.action_count == 0) return false;
  return lookup.actions[0].type == TSParseActionTypeReduce;
}
// Look up the parse-table value for `symbol` in `state`.
//
// States below `large_state_count` (and all states of older languages) use
// the dense table indexed by state and symbol. Other states use the small
// table, which is laid out as: a section count, then for each section a
// value, a symbol count, and that many symbols. The value of the section
// listing `symbol` is returned, or 0 when no section lists it.
static inline uint16_t ts_language_lookup(
  const TSLanguage *self,
  TSStateId state,
  TSSymbol symbol
) {
  bool uses_small_table =
    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
    state >= self->large_state_count;
  if (!uses_small_table) {
    return self->parse_table[state * self->symbol_count + symbol];
  }

  uint32_t index = self->small_parse_table_map[state - self->large_state_count];
  const uint16_t *cursor = &self->small_parse_table[index];
  uint16_t section_count = *(cursor++);
  for (unsigned section = 0; section < section_count; section++) {
    uint16_t section_value = *(cursor++);
    uint16_t symbol_count = *(cursor++);
    for (unsigned k = 0; k < symbol_count; k++) {
      if (*(cursor++) == symbol) return section_value;
    }
  }
  return 0;
}
// Determine the state reached from `state` on `symbol`.
//
// * Built-in error symbols always lead to state 0.
// * For tokens, the last parse action decides: a shift or recover action
//   yields its target state, anything else (or no action) yields 0.
// * For non-terminals, the parse table value itself is the next state.
static inline TSStateId ts_language_next_state(const TSLanguage *self,
                                               TSStateId state,
                                               TSSymbol symbol) {
  if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
    return 0;
  }
  if (symbol >= self->token_count) {
    return ts_language_lookup(self, state, symbol);
  }

  uint32_t action_count;
  const TSParseAction *actions = ts_language_actions(self, state, symbol, &action_count);
  if (action_count == 0) return 0;

  TSParseAction last = actions[action_count - 1];
  bool moves_to_state =
    last.type == TSParseActionTypeShift ||
    last.type == TSParseActionTypeRecover;
  return moves_to_state ? last.params.state : 0;
}
// Return the row of the external scanner's state table for
// `external_scanner_state` (one flag per external token), or NULL for
// state 0.
static inline const bool *
ts_language_enabled_external_tokens(const TSLanguage *self,
                                    unsigned external_scanner_state) {
  if (external_scanner_state == 0) return NULL;
  uint32_t row_offset = self->external_token_count * external_scanner_state;
  return self->external_scanner.states + row_offset;
}
// Return the alias sequence for a production: a row of
// `max_alias_sequence_length` symbols inside `alias_sequences`.
// Production 0 has no aliases and yields NULL.
static inline const TSSymbol *
ts_language_alias_sequence(const TSLanguage *self, uint32_t production_id) {
  if (production_id == 0) return NULL;
  return self->alias_sequences + production_id * self->max_alias_sequence_length;
}
// Retrieve the field-map entries for a production as a half-open pointer
// range [*start, *end).
//
// Both outputs are set to NULL when the language predates field support or
// declares no fields.
static inline void ts_language_field_map(
  const TSLanguage *self,
  uint32_t production_id,
  const TSFieldMapEntry **start,
  const TSFieldMapEntry **end
) {
  bool has_fields =
    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS &&
    self->field_count != 0;
  if (!has_fields) {
    *start = NULL;
    *end = NULL;
    return;
  }

  TSFieldMapSlice slice = self->field_map_slices[production_id];
  const TSFieldMapEntry *first = &self->field_map_entries[slice.index];
  *start = first;
  *end = first + slice.length;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_LANGUAGE_H_

View File

@ -0,0 +1,44 @@
#ifndef TREE_SITTER_LENGTH_H_
#define TREE_SITTER_LENGTH_H_
#include <stdlib.h>
#include <stdbool.h>
#include "./point.h"
#include "tree_sitter/api.h"
// A position or size in the source text, tracked both as a byte count and
// as a row/column point.
typedef struct {
uint32_t bytes;
TSPoint extent;
} Length;
// Sentinel recognized by length_is_undefined(): zero bytes combined with a
// non-zero column.
static const Length LENGTH_UNDEFINED = {0, {0, 1}};
// Largest representable length; every component is at its maximum.
static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}};
// True for the "undefined" sentinel: zero bytes but a non-zero column.
static inline bool length_is_undefined(Length length) {
  if (length.bytes != 0) return false;
  return length.extent.column != 0;
}
// Return whichever length has fewer bytes; `len2` wins ties.
static inline Length length_min(Length len1, Length len2) {
  if (len1.bytes < len2.bytes) {
    return len1;
  }
  return len2;
}
// Sum of two lengths: bytes are added directly, extents via point_add().
static inline Length length_add(Length len1, Length len2) {
  Length sum = {
    .bytes = len1.bytes + len2.bytes,
    .extent = point_add(len1.extent, len2.extent),
  };
  return sum;
}
// Difference of two lengths: bytes are subtracted directly, extents via
// point_sub().
static inline Length length_sub(Length len1, Length len2) {
  Length difference = {
    .bytes = len1.bytes - len2.bytes,
    .extent = point_sub(len1.extent, len2.extent),
  };
  return difference;
}
static inline Length length_zero(void) {
Length result = {0, {0, 0}};
return result;
}
#endif

View File

@ -0,0 +1,391 @@
#include <stdio.h>
#include "./lexer.h"
#include "./subtree.h"
#include "./length.h"
#include "./unicode.h"
// Emit a lexing log message, if a logger callback is installed.
//
// `message` must be a string literal (it is concatenated with the format
// suffix). `character` is printed as a quoted character when it is printable
// ASCII (32..126) and as a number otherwise. The macro expects a variable
// named `self` of the lexer type to be in scope at the expansion site.
#define LOG(message, character) \
if (self->logger.log) { \
snprintf( \
self->debug_buffer, \
TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
32 <= character && character < 127 ? \
message " character:'%c'" : \
message " character:%d", \
character \
); \
self->logger.log( \
self->logger.payload, \
TSLogTypeLex, \
self->debug_buffer \
); \
}
// Code point of the Unicode byte order mark; skipped by ts_lexer_start()
// when it is the lookahead at byte offset 0.
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
// A single range that spans every possible position, from (0, 0) up to the
// maximum row, column and byte offset.
static const TSRange DEFAULT_RANGE = {
.start_point = {
.row = 0,
.column = 0,
},
.end_point = {
.row = UINT32_MAX,
.column = UINT32_MAX,
},
.start_byte = 0,
.end_byte = UINT32_MAX
};
// Report whether the lexer is at end-of-file. EOF is encoded by the lexer's
// `current_included_range_index` being equal to the number of included
// ranges, i.e. every available range has been consumed.
static bool ts_lexer__eof(const TSLexer *_self) {
  const Lexer *lexer = (const Lexer *)_self;
  return lexer->included_range_count == lexer->current_included_range_index;
}
// Forget the cached chunk of source text. Used whenever the lexer's position
// changes in a way that makes the cached chunk stale.
static void ts_lexer__clear_chunk(Lexer *self) {
  self->chunk_start = 0;
  self->chunk_size = 0;
  self->chunk = NULL;
}
// Ask the input callback for the chunk of source text that begins at the
// current position. An empty chunk means there is no more input: the lexer
// is put into the EOF state and the chunk pointer is cleared.
static void ts_lexer__get_chunk(Lexer *self) {
  Length at = self->current_position;
  self->chunk_start = at.bytes;
  self->chunk = self->input.read(
    self->input.payload,
    at.bytes,
    at.extent,
    &self->chunk_size
  );

  if (self->chunk_size == 0) {
    self->current_included_range_index = self->included_range_count;
    self->chunk = NULL;
  }
}
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
//
// On return, `data.lookahead` holds the decoded character and
// `lookahead_size` the number of bytes it occupies. If no bytes remain in
// the chunk, the lookahead is '\0' with size 1. A character that cannot be
// decoded is reported as TS_DECODE_ERROR with size 1.
static void ts_lexer__get_lookahead(Lexer *self) {
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
uint32_t size = self->chunk_size - position_in_chunk;
if (size == 0) {
self->lookahead_size = 1;
self->data.lookahead = '\0';
return;
}
// Pick the decoder that matches the input's declared encoding.
UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
? ts_decode_utf8
: ts_decode_utf16;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
// If this chunk ended in the middle of a multi-byte character,
// try again with a fresh chunk.
if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
ts_lexer__get_chunk(self);
chunk = (const uint8_t *)self->chunk;
size = self->chunk_size;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
}
// Step over an undecodable sequence one byte at a time.
if (self->data.lookahead == TS_DECODE_ERROR) {
self->lookahead_size = 1;
}
}
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed.
//
// When `skip` is true, the token start position is moved to the new
// position as well. Does nothing if the lexer has no current chunk.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
// The local must be named `self`: the LOG macro refers to it by name.
Lexer *self = (Lexer *)_self;
if (!self->chunk) return;
if (skip) {
LOG("skip", self->data.lookahead);
} else {
LOG("consume", self->data.lookahead);
}
// Move past the current lookahead character, updating row/column. The
// column advances by the character's size in bytes.
if (self->lookahead_size) {
self->current_position.bytes += self->lookahead_size;
if (self->data.lookahead == '\n') {
self->current_position.extent.row++;
self->current_position.extent.column = 0;
} else {
self->current_position.extent.column += self->lookahead_size;
}
}
// If the end of the current included range was reached, jump to the start
// of the next one; `current_range` ends up NULL when no range remains.
const TSRange *current_range = NULL;
if (self->current_included_range_index < self->included_range_count) {
current_range = &self->included_ranges[self->current_included_range_index];
if (self->current_position.bytes == current_range->end_byte) {
self->current_included_range_index++;
if (self->current_included_range_index < self->included_range_count) {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
} else {
current_range = NULL;
}
}
}
if (skip) self->token_start_position = self->current_position;
if (current_range) {
// Still inside an included range: refill the chunk if the position has
// moved past it, then decode the next character.
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
ts_lexer__get_chunk(self);
}
ts_lexer__get_lookahead(self);
} else {
// No ranges left: present a NUL lookahead at EOF.
ts_lexer__clear_chunk(self);
self->data.lookahead = '\0';
self->lookahead_size = 1;
}
}
// Record the end of the token being matched. May be called repeatedly when
// a longer match is found later.
//
// Normally the token ends at the current position. The exception: when the
// lexer sits exactly at the start of an included range other than the first,
// the token is considered to end at the *end* of the previous included
// range instead.
static void ts_lexer__mark_end(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;
  Length token_end = self->current_position;

  if (!ts_lexer__eof(&self->data)) {
    TSRange *range = &self->included_ranges[self->current_included_range_index];
    bool at_start_of_later_range =
      self->current_included_range_index > 0 &&
      self->current_position.bytes == range->start_byte;
    if (at_start_of_later_range) {
      TSRange *previous = range - 1;
      token_end = (Length) {
        previous->end_byte,
        previous->end_point,
      };
    }
  }

  self->token_end_position = token_end;
}
// Count the characters between the start of the current line and the
// current position.
//
// The lexer is rewound to the beginning of the line (fetching a new chunk if
// that lies before the cached one) and then advanced one character at a time
// until it is back at the original byte offset.
static uint32_t ts_lexer__get_column(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;
  const uint32_t goal_byte = self->current_position.bytes;

  self->current_position.bytes -= self->current_position.extent.column;
  self->current_position.extent.column = 0;
  if (self->current_position.bytes < self->chunk_start) {
    ts_lexer__get_chunk(self);
  }

  uint32_t column = 0;
  for (; self->current_position.bytes < goal_byte; column++) {
    ts_lexer__advance(&self->data, false);
  }
  return column;
}
// Is the lexer at a boundary between two disjoint included ranges of
// source code? This is exposed as an API because some languages' external
// scanners need to perform custom actions at these boundaries.
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
  const Lexer *self = (const Lexer *)_self;
  if (self->current_included_range_index >= self->included_range_count) {
    return false;
  }
  const TSRange *range = &self->included_ranges[self->current_included_range_index];
  return range->start_byte == self->current_position.bytes;
}
// Put a lexer into its initial state: callbacks installed, no cached chunk,
// position zero, no logger, and the default set of included ranges.
void ts_lexer_init(Lexer *self) {
  Lexer initial = {
    // The lexer's methods are stored as struct fields so that generated
    // parsers can call them without needing to be linked against this
    // library.
    .data = {
      .advance = ts_lexer__advance,
      .mark_end = ts_lexer__mark_end,
      .get_column = ts_lexer__get_column,
      .is_at_included_range_start = ts_lexer__is_at_included_range_start,
      .eof = ts_lexer__eof,
      .lookahead = 0,
      .result_symbol = 0,
    },
    .chunk = NULL,
    .chunk_size = 0,
    .chunk_start = 0,
    .current_position = {0, {0, 0}},
    .logger = {
      .payload = NULL,
      .log = NULL
    },
    .included_ranges = NULL,
    .included_range_count = 0,
    .current_included_range_index = 0,
  };
  *self = initial;
  ts_lexer_set_included_ranges(self, NULL, 0);
}
// Release the lexer's included-ranges array.
void ts_lexer_delete(Lexer *self) {
  TSRange *owned_ranges = self->included_ranges;
  ts_free(owned_ranges);
}
// Move the lexer to `position`, or to the first valid position after it.
//
// If `position` falls before or inside an included range, the lexer lands
// inside that range (snapping forward to the range start when `position`
// lies in a gap), and the lookahead is invalidated so that it gets decoded
// again. If `position` lies beyond every included range, the lexer enters
// the EOF state at the end of the last range.
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;
  bool found_included_range = false;

  // Move to the first valid position at or after the given position.
  for (unsigned i = 0; i < self->included_range_count; i++) {
    TSRange *included_range = &self->included_ranges[i];
    if (included_range->end_byte > position.bytes) {
      if (included_range->start_byte > position.bytes) {
        self->current_position = (Length) {
          .bytes = included_range->start_byte,
          .extent = included_range->start_point,
        };
      }
      self->current_included_range_index = i;
      found_included_range = true;
      break;
    }
  }

  if (found_included_range) {
    // If the current position is outside of the current chunk of text,
    // then clear out the current chunk of text. The check uses the adjusted
    // `self->current_position` rather than the requested `position`: when
    // the position was snapped forward to a later range, the cached chunk
    // may cover the requested offset but not the one actually in effect.
    if (self->chunk && (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    )) {
      ts_lexer__clear_chunk(self);
    }
    self->lookahead_size = 0;
    self->data.lookahead = '\0';
  }
  // If the given position is beyond any of included ranges, move to the EOF
  // state - past the end of the included ranges.
  else {
    self->current_included_range_index = self->included_range_count;
    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = last_included_range->end_byte,
      .extent = last_included_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
  }
}
// Install a new input source. The chunk cached from the previous input is
// cleared, and ts_lexer_goto is re-run on the current position so that the
// included-range index and lookahead state are recomputed.
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__clear_chunk(self);
ts_lexer_goto(self, self->current_position);
}
// Reposition the lexer at `position`. Nothing is touched when the lexer's
// byte offset already equals the requested one.
void ts_lexer_reset(Lexer *self, Length position) {
  if (position.bytes == self->current_position.bytes) return;
  ts_lexer_goto(self, position);
}
// Begin lexing a new token at the current position: record the token start,
// mark the token end as undefined, clear the result symbol, and make sure a
// chunk and a lookahead character are loaded. A byte order mark at byte
// offset zero is advanced over (passing `true` as ts_lexer__advance's second
// argument).
void ts_lexer_start(Lexer *self) {
  self->token_start_position = self->current_position;
  self->token_end_position = LENGTH_UNDEFINED;
  self->data.result_symbol = 0;

  // At EOF there is nothing to load.
  if (ts_lexer__eof(&self->data)) return;

  if (self->chunk_size == 0) ts_lexer__get_chunk(self);
  if (self->lookahead_size == 0) ts_lexer__get_lookahead(self);

  bool at_leading_bom =
    self->current_position.bytes == 0 &&
    self->data.lookahead == BYTE_ORDER_MARK;
  if (at_leading_bom) ts_lexer__advance(&self->data, true);
}
// Finish lexing the current token. If the token end was never marked, mark
// it at the current position. `*lookahead_end_byte` is raised (never lowered)
// to the byte offset up to which the lexer has examined input.
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
  if (length_is_undefined(self->token_end_position)) {
    ts_lexer__mark_end(&self->data);
  }

  // The lexer has looked at the byte at the current position, hence "+ 1".
  //
  // In order to determine that a byte sequence is invalid UTF8 or UTF16,
  // the character decoding algorithm may have looked at the following byte.
  // Therefore, the next byte *after* the current (invalid) character
  // affects the interpretation of the current character.
  uint32_t examined_end = self->current_position.bytes + 1;
  if (self->data.lookahead == TS_DECODE_ERROR) {
    examined_end += 1;
  }

  if (*lookahead_end_byte < examined_end) {
    *lookahead_end_byte = examined_end;
  }
}
// Keep advancing (passing `false` as ts_lexer__advance's second argument)
// for as long as the lexer holds a chunk.
// NOTE(review): presumably `chunk` becomes NULL once the input is exhausted;
// that happens in ts_lexer__advance, which is defined outside this block.
void ts_lexer_advance_to_end(Lexer *self) {
while (self->chunk) {
ts_lexer__advance(&self->data, false);
}
}
// Public wrapper that invokes the lexer's mark_end callback implementation
// on this lexer's embedded TSLexer.
void ts_lexer_mark_end(Lexer *self) {
ts_lexer__mark_end(&self->data);
}
// Replace the lexer's included ranges with a copy of `ranges[0..count)`.
//
// Passing no ranges (`count == 0` or `ranges == NULL`) installs the single
// DEFAULT_RANGE. Otherwise the ranges are validated first: each range must
// start at or after the end of the previous one and must not end before it
// starts. On a validation failure the function returns false and leaves the
// lexer untouched. On success the lexer is re-positioned at its current
// position and true is returned.
bool ts_lexer_set_included_ranges(
  Lexer *self,
  const TSRange *ranges,
  uint32_t count
) {
  bool use_default = (count == 0 || !ranges);
  if (use_default) {
    ranges = &DEFAULT_RANGE;
    count = 1;
  } else {
    uint32_t earliest_allowed_start = 0;
    const TSRange *ranges_end = ranges + count;
    for (const TSRange *range = ranges; range != ranges_end; range++) {
      bool overlaps_previous = range->start_byte < earliest_allowed_start;
      bool ends_before_start = range->end_byte < range->start_byte;
      if (overlaps_previous || ends_before_start) return false;
      earliest_allowed_start = range->end_byte;
    }
  }

  // The lexer keeps its own copy of the ranges.
  size_t byte_count = count * sizeof(TSRange);
  self->included_ranges = ts_realloc(self->included_ranges, byte_count);
  memcpy(self->included_ranges, ranges, byte_count);
  self->included_range_count = count;

  // Re-run positioning so the current position and range index are
  // consistent with the new set of ranges.
  ts_lexer_goto(self, self->current_position);
  return true;
}
// Return the lexer's own array of included ranges and store its length in
// `*count`. The returned pointer is the lexer's internal array, not a copy.
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
*count = self->included_range_count;
return self->included_ranges;
}
#undef LOG

View File

@ -0,0 +1,48 @@
#ifndef TREE_SITTER_LEXER_H_
#define TREE_SITTER_LEXER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./length.h"
#include "./subtree.h"
#include "tree_sitter/api.h"
#include "tree_sitter/parser.h"
// Internal lexer state. The embedded TSLexer comes first; the lexer callbacks
// receive a `TSLexer *` and cast it back to `Lexer *`.
typedef struct {
// Public lexer interface (callbacks, lookahead, result symbol).
TSLexer data;
// Position the lexer is currently at.
Length current_position;
// Start and end of the token currently being lexed.
Length token_start_position;
Length token_end_position;
// Array of source ranges the lexer is allowed to read, with its length and
// the index of the range containing the current position.
TSRange *included_ranges;
size_t included_range_count;
size_t current_included_range_index;
// Currently cached piece of input text, its starting byte offset in the
// document, and its length in bytes.
const char *chunk;
uint32_t chunk_start;
uint32_t chunk_size;
// NOTE(review): presumably the byte width of the current lookahead
// character — confirm against the decoding code.
uint32_t lookahead_size;
// Source of input text and optional logger.
TSInput input;
TSLogger logger;
// NOTE(review): presumably scratch space for formatting log messages — confirm.
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
} Lexer;
// Lifecycle.
void ts_lexer_init(Lexer *);
void ts_lexer_delete(Lexer *);
// Input and positioning.
void ts_lexer_set_input(Lexer *, TSInput);
void ts_lexer_reset(Lexer *, Length);
// Per-token lexing: start a token, finish it (updating the furthest examined
// byte through the uint32_t pointer), skip to the end, or mark the token end.
void ts_lexer_start(Lexer *);
void ts_lexer_finish(Lexer *, uint32_t *);
void ts_lexer_advance_to_end(Lexer *);
void ts_lexer_mark_end(Lexer *);
// Included ranges: the setter returns false if the given ranges are rejected.
bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count);
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_LEXER_H_

View File

@ -0,0 +1,17 @@
// The Tree-sitter library can be built by compiling this one source file.
//
// The following directories must be added to the include path:
// - include
#define _POSIX_C_SOURCE 200112L
#include "./get_changed_ranges.c"
#include "./language.c"
#include "./lexer.c"
#include "./node.c"
#include "./parser.c"
#include "./query.c"
#include "./stack.c"
#include "./subtree.c"
#include "./tree_cursor.c"
#include "./tree.c"

View File

@ -0,0 +1,675 @@
#include <stdbool.h>
#include "./subtree.h"
#include "./tree.h"
#include "./language.h"
// Cursor over the direct children of one subtree, tracking the document
// position as it goes.
typedef struct {
// Subtree whose children are being iterated (NULL_SUBTREE if it has none).
Subtree parent;
// Tree the nodes belong to; copied into every TSNode produced.
const TSTree *tree;
// Running position: after a child has been returned, this is the end of
// that child.
Length position;
// Index of the next child to return.
uint32_t child_index;
// Number of non-extra children seen so far; indexes `alias_sequence`.
uint32_t structural_child_index;
// Aliases for the parent's production, or NULL.
const TSSymbol *alias_sequence;
} NodeChildIterator;
// TSNode - constructors
// Build a TSNode value for `subtree` located at `position` within `tree`.
// The node's four context words hold, in order: start byte, start row,
// start column and alias symbol.
TSNode ts_node_new(
  const TSTree *tree,
  const Subtree *subtree,
  Length position,
  TSSymbol alias
) {
  TSNode node = {
    {position.bytes, position.extent.row, position.extent.column, alias},
    subtree,
    tree,
  };
  return node;
}
// The "null" node: no tree, no subtree, zero position, no alias.
// ts_node_is_null recognizes it by its zero `id`.
static inline TSNode ts_node__null(void) {
return ts_node_new(NULL, NULL, length_zero(), 0);
}
// TSNode - accessors
// Start byte offset of the node (context word 0).
uint32_t ts_node_start_byte(TSNode self) {
return self.context[0];
}
// Start row/column of the node (context words 1 and 2).
TSPoint ts_node_start_point(TSNode self) {
return (TSPoint) {self.context[1], self.context[2]};
}
// Alias symbol of the node (context word 3); zero means no alias.
static inline uint32_t ts_node__alias(const TSNode *self) {
return self->context[3];
}
// The Subtree the node refers to. `id` is a pointer to the Subtree and is
// dereferenced unconditionally, so this must not be called on a null node.
static inline Subtree ts_node__subtree(TSNode self) {
return *(const Subtree *)self.id;
}
// NodeChildIterator
static inline NodeChildIterator ts_node_iterate_children(const TSNode *node) {
Subtree subtree = ts_node__subtree(*node);
if (ts_subtree_child_count(subtree) == 0) {
return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL};
}
const TSSymbol *alias_sequence = ts_language_alias_sequence(
node->tree->language,
subtree.ptr->production_id
);
return (NodeChildIterator) {
.tree = node->tree,
.parent = subtree,
.position = {ts_node_start_byte(*node), ts_node_start_point(*node)},
.child_index = 0,
.structural_child_index = 0,
.alias_sequence = alias_sequence,
};
}
// True once every child of the parent has been returned. Dereferences
// `parent.ptr`, so callers must first make sure the parent is not null.
static inline bool ts_node_child_iterator_done(NodeChildIterator *self) {
return self->child_index == self->parent.ptr->child_count;
}
// Produce the next child into `*result` and advance the iterator.
// Returns false (leaving `*result` untouched) when the iterator has no
// parent or is exhausted. After a successful call, `self->position` is the
// end position of the child that was just returned.
static inline bool ts_node_child_iterator_next(
NodeChildIterator *self,
TSNode *result
) {
if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false;
const Subtree *child = &self->parent.ptr->children[self->child_index];
TSSymbol alias_symbol = 0;
// Only non-extra children consume a slot in the alias sequence.
if (!ts_subtree_extra(*child)) {
if (self->alias_sequence) {
alias_symbol = self->alias_sequence[self->structural_child_index];
}
self->structural_child_index++;
}
// The first child's padding is not added: iteration started at the parent
// node's start position.
// NOTE(review): presumably the parent's start already lies after the first
// child's padding — confirm.
if (self->child_index > 0) {
self->position = length_add(self->position, ts_subtree_padding(*child));
}
*result = ts_node_new(
self->tree,
child,
self->position,
alias_symbol
);
// Move past the child's content so `position` is now the child's end.
self->position = length_add(self->position, ts_subtree_size(*child));
self->child_index++;
return true;
}
// TSNode - private
// Decide whether a node counts as a child/sibling for the public API.
//  - With `include_anonymous`, a node is relevant if its subtree is visible
//    or it carries an alias.
//  - Without it, an aliased node is relevant if the alias symbol is named;
//    an unaliased node must be both visible and named.
static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) {
  Subtree subtree = ts_node__subtree(self);

  if (include_anonymous) {
    return ts_subtree_visible(subtree) || ts_node__alias(&self);
  }

  TSSymbol alias = ts_node__alias(&self);
  if (!alias) {
    return ts_subtree_visible(subtree) && ts_subtree_named(subtree);
  }
  return ts_language_symbol_metadata(self.tree->language, alias).named;
}
// Number of relevant children recorded on the node's subtree: the visible
// child count when anonymous nodes are included, the named child count
// otherwise. A subtree without children yields 0.
static inline uint32_t ts_node__relevant_child_count(
  TSNode self,
  bool include_anonymous
) {
  Subtree subtree = ts_node__subtree(self);
  if (ts_subtree_child_count(subtree) == 0) return 0;

  if (!include_anonymous) {
    return subtree.ptr->named_child_count;
  }
  return subtree.ptr->visible_child_count;
}
// Return the `child_index`-th relevant child of `self`, or the null node if
// there is no such child. Children that are not relevant themselves are
// looked through: their relevant children are counted as if they belonged
// to `self` directly.
static inline TSNode ts_node__child(
TSNode self,
uint32_t child_index,
bool include_anonymous
) {
TSNode result = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
uint32_t index = 0;
NodeChildIterator iterator = ts_node_iterate_children(&result);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node__is_relevant(child, include_anonymous)) {
if (index == child_index) {
// Remember `self` as the parent of the node being returned.
ts_tree_set_cached_parent(self.tree, &child, &self);
return child;
}
index++;
} else {
// A non-relevant child: if the wanted index falls among its relevant
// children, restart the search inside it with a rebased index;
// otherwise skip over all of its relevant children.
uint32_t grandchild_index = child_index - index;
uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous);
if (grandchild_index < grandchild_count) {
did_descend = true;
result = child;
child_index = grandchild_index;
break;
}
index += grandchild_count;
}
}
}
return ts_node__null();
}
// Returns true if `other` is one of the trailing zero-byte children of
// `self`, or (recursively) a trailing zero-byte descendant of one of them.
// Children are scanned from last to first and the scan stops at the first
// child that occupies any bytes.
static bool ts_subtree_has_trailing_empty_descendant(
Subtree self,
Subtree other
) {
// `i + 1 > 0` ends the loop when the unsigned index wraps around below
// zero; it also makes the loop body never run when there are no children.
for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) {
Subtree child = self.ptr->children[i];
if (ts_subtree_total_bytes(child) > 0) break;
if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) {
return true;
}
}
return false;
}
// Find the closest relevant node that precedes `self`, or the null node.
//
// The search starts at `self`'s parent and walks its children up to `self`,
// remembering the last candidate seen: either a relevant child, or a
// non-relevant child that has relevant children of its own. Depending on
// what was found it then descends into the child containing `self`,
// descends into the candidate, or falls back to a candidate remembered
// from an outer level.
static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) {
Subtree self_subtree = ts_node__subtree(self);
bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0;
uint32_t target_end_byte = ts_node_end_byte(self);
TSNode node = ts_node_parent(self);
// Best candidate found at an outer level, used as a fallback.
TSNode earlier_node = ts_node__null();
bool earlier_node_is_relevant = false;
while (!ts_node_is_null(node)) {
// Best candidate among the children of `node` that precede the target.
TSNode earlier_child = ts_node__null();
bool earlier_child_is_relevant = false;
bool found_child_containing_target = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (child.id == self.id) break;
// `iterator.position` is the end of `child`: a child ending after the
// target's end contains the target.
if (iterator.position.bytes > target_end_byte) {
found_child_containing_target = true;
break;
}
// A child ending exactly at the target's end contains the target when
// the target is non-empty, or when the (empty) target is one of the
// child's trailing empty descendants.
if (iterator.position.bytes == target_end_byte &&
(!self_is_empty ||
ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) {
found_child_containing_target = true;
break;
}
if (ts_node__is_relevant(child, include_anonymous)) {
earlier_child = child;
earlier_child_is_relevant = true;
} else if (ts_node__relevant_child_count(child, include_anonymous) > 0) {
earlier_child = child;
earlier_child_is_relevant = false;
}
}
if (found_child_containing_target) {
// Keep the candidate as a fallback and continue inside the child that
// contains the target.
if (!ts_node_is_null(earlier_child)) {
earlier_node = earlier_child;
earlier_node_is_relevant = earlier_child_is_relevant;
}
node = child;
} else if (earlier_child_is_relevant) {
return earlier_child;
} else if (!ts_node_is_null(earlier_child)) {
// The candidate is not relevant itself; search inside it.
node = earlier_child;
} else if (earlier_node_is_relevant) {
return earlier_node;
} else {
// Search inside the fallback; if it is the null node the loop ends.
node = earlier_node;
}
}
return ts_node__null();
}
// Find the closest relevant node that follows `self`, or the null node.
//
// Mirror image of ts_node__prev_sibling: starting at `self`'s parent, skip
// the children that end before `self` ends, note a child that contains
// `self`, and take the first later child that is relevant or has relevant
// children. Then either descend into the containing child, return or
// descend into the candidate, or fall back to a candidate from an outer
// level.
static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) {
uint32_t target_end_byte = ts_node_end_byte(self);
TSNode node = ts_node_parent(self);
// Best candidate found at an outer level, used as a fallback.
TSNode later_node = ts_node__null();
bool later_node_is_relevant = false;
while (!ts_node_is_null(node)) {
TSNode later_child = ts_node__null();
bool later_child_is_relevant = false;
TSNode child_containing_target = ts_node__null();
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
// `iterator.position` is the end of `child`; skip children that end
// before the target ends.
if (iterator.position.bytes < target_end_byte) continue;
if (ts_node_start_byte(child) <= ts_node_start_byte(self)) {
// This child spans the target. Unless it is the target itself, it is
// the child to descend into.
if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) {
child_containing_target = child;
}
} else if (ts_node__is_relevant(child, include_anonymous)) {
later_child = child;
later_child_is_relevant = true;
break;
} else if (ts_node__relevant_child_count(child, include_anonymous) > 0) {
later_child = child;
later_child_is_relevant = false;
break;
}
}
if (!ts_node_is_null(child_containing_target)) {
// Keep the candidate as a fallback and continue inside the child that
// contains the target.
if (!ts_node_is_null(later_child)) {
later_node = later_child;
later_node_is_relevant = later_child_is_relevant;
}
node = child_containing_target;
} else if (later_child_is_relevant) {
return later_child;
} else if (!ts_node_is_null(later_child)) {
// The candidate is not relevant itself; search inside it.
node = later_child;
} else if (later_node_is_relevant) {
return later_node;
} else {
// Search inside the fallback; if it is the null node the loop ends.
node = later_node;
}
}
return ts_node__null();
}
// Return the first relevant child of `self` whose end byte is greater than
// `goal`, or the null node. A non-relevant child that ends after `goal` and
// has children is searched in place of `self`.
static inline TSNode ts_node__first_child_for_byte(
TSNode self,
uint32_t goal,
bool include_anonymous
) {
TSNode node = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node_end_byte(child) > goal) {
if (ts_node__is_relevant(child, include_anonymous)) {
return child;
} else if (ts_node_child_count(child) > 0) {
// Not relevant itself: restart the scan among its children.
did_descend = true;
node = child;
break;
}
}
}
}
return ts_node__null();
}
// Return the deepest relevant descendant of `self` that spans the byte range
// [range_start, range_end]. If no relevant descendant spans it, `self`
// itself is returned.
static inline TSNode ts_node__descendant_for_byte_range(
TSNode self,
uint32_t range_start,
uint32_t range_end,
bool include_anonymous
) {
TSNode node = self;
TSNode last_visible_node = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
// After `next`, the iterator position is the end of `child`.
uint32_t node_end = iterator.position.bytes;
// The end of this node must extend far enough forward to touch
// the end of the range and exceed the start of the range.
if (node_end < range_end) continue;
if (node_end <= range_start) continue;
// The start of this node must extend far enough backward to
// touch the start of the range.
if (range_start < ts_node_start_byte(child)) break;
node = child;
if (ts_node__is_relevant(node, include_anonymous)) {
// Record the previous relevant node as this one's parent.
ts_tree_set_cached_parent(self.tree, &child, &last_visible_node);
last_visible_node = node;
}
did_descend = true;
break;
}
}
return last_visible_node;
}
// Return the deepest relevant descendant of `self` that spans the row/column
// range [range_start, range_end]. If no relevant descendant spans it, `self`
// itself is returned. Same algorithm as the byte-range variant, comparing
// points instead of byte offsets.
static inline TSNode ts_node__descendant_for_point_range(
TSNode self,
TSPoint range_start,
TSPoint range_end,
bool include_anonymous
) {
TSNode node = self;
TSNode last_visible_node = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
// After `next`, the iterator position is the end of `child`.
TSPoint node_end = iterator.position.extent;
// The end of this node must extend far enough forward to touch
// the end of the range and exceed the start of the range.
if (point_lt(node_end, range_end)) continue;
if (point_lte(node_end, range_start)) continue;
// The start of this node must extend far enough backward to
// touch the start of the range.
if (point_lt(range_start, ts_node_start_point(child))) break;
node = child;
if (ts_node__is_relevant(node, include_anonymous)) {
// Record the previous relevant node as this one's parent.
ts_tree_set_cached_parent(self.tree, &child, &last_visible_node);
last_visible_node = node;
}
did_descend = true;
break;
}
}
return last_visible_node;
}
// TSNode - public
// End byte offset: the node's start byte plus the byte size of its subtree.
uint32_t ts_node_end_byte(TSNode self) {
return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes;
}
// End row/column: the node's start point advanced by the extent of its subtree.
TSPoint ts_node_end_point(TSNode self) {
return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent);
}
// Public symbol of the node: the alias symbol if the node has one, otherwise
// the symbol of its subtree, mapped through ts_language_public_symbol.
TSSymbol ts_node_symbol(TSNode self) {
  TSSymbol alias = ts_node__alias(&self);
  TSSymbol symbol = alias ? alias : ts_subtree_symbol(ts_node__subtree(self));
  return ts_language_public_symbol(self.tree->language, symbol);
}
// Name of the node's type: the name of its alias symbol if the node has one,
// otherwise the name of its subtree's symbol.
const char *ts_node_type(TSNode self) {
  TSSymbol alias = ts_node__alias(&self);
  TSSymbol symbol = alias ? alias : ts_subtree_symbol(ts_node__subtree(self));
  return ts_language_symbol_name(self.tree->language, symbol);
}
// String rendering of the node's subtree, produced by ts_subtree_string.
// NOTE(review): presumably a newly allocated string owned by the caller —
// confirm against ts_subtree_string.
char *ts_node_string(TSNode self) {
return ts_subtree_string(ts_node__subtree(self), self.tree->language, false);
}
// Two nodes are equal when they refer to the same subtree in the same tree.
bool ts_node_eq(TSNode self, TSNode other) {
return self.tree == other.tree && self.id == other.id;
}
// A node is null when its `id` (subtree pointer) is zero.
bool ts_node_is_null(TSNode self) {
return self.id == 0;
}
// Whether the node's subtree is flagged as extra.
bool ts_node_is_extra(TSNode self) {
return ts_subtree_extra(ts_node__subtree(self));
}
// Whether the node is named. For an aliased node this is decided by the
// alias symbol's metadata; otherwise by the subtree itself.
bool ts_node_is_named(TSNode self) {
  TSSymbol alias = ts_node__alias(&self);
  if (alias) {
    return ts_language_symbol_metadata(self.tree->language, alias).named;
  }
  return ts_subtree_named(ts_node__subtree(self));
}
// Whether the node's subtree is flagged as missing.
bool ts_node_is_missing(TSNode self) {
return ts_subtree_missing(ts_node__subtree(self));
}
// Whether the node's subtree is flagged as having changes.
bool ts_node_has_changes(TSNode self) {
return ts_subtree_has_changes(ts_node__subtree(self));
}
// Whether the node's subtree has a non-zero error cost.
bool ts_node_has_error(TSNode self) {
return ts_subtree_error_cost(ts_node__subtree(self)) > 0;
}
// Return the closest relevant (visible or aliased) ancestor of `self`, or
// the null node if `self` is the root.
//
// A cached parent is used when the tree has one. Otherwise the tree is
// walked downward from the root, following at each level the child whose
// span reaches the end of `self`, until `self` is reached. Parent links
// discovered on the way are stored in the tree's parent cache.
TSNode ts_node_parent(TSNode self) {
TSNode node = ts_tree_get_cached_parent(self.tree, &self);
if (node.id) return node;
node = ts_tree_root_node(self.tree);
uint32_t end_byte = ts_node_end_byte(self);
if (node.id == self.id) return ts_node__null();
TSNode last_visible_node = node;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
// Stop at `self` itself, or once the children start after `self`.
if (
ts_node_start_byte(child) > ts_node_start_byte(self) ||
child.id == self.id
) break;
// `iterator.position` is the end of `child`: descend into the first
// child that extends at least to the end of `self`.
if (iterator.position.bytes >= end_byte) {
node = child;
if (ts_node__is_relevant(child, true)) {
ts_tree_set_cached_parent(self.tree, &node, &last_visible_node);
last_visible_node = node;
}
did_descend = true;
break;
}
}
}
return last_visible_node;
}
// The `child_index`-th child, counting anonymous nodes as well.
TSNode ts_node_child(TSNode self, uint32_t child_index) {
return ts_node__child(self, child_index, true);
}
// The `child_index`-th child, counting named nodes only.
TSNode ts_node_named_child(TSNode self, uint32_t child_index) {
return ts_node__child(self, child_index, false);
}
// Return the child of `self` associated with the field `field_id`, or the
// null node if the field id is zero, the node has no children, or the
// node's production has no mapping for that field.
TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) {
recur:
if (!field_id || ts_node_child_count(self) == 0) return ts_node__null();
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self.tree->language,
ts_node__subtree(self).ptr->production_id,
&field_map,
&field_map_end
);
if (field_map == field_map_end) return ts_node__null();
// The field mappings are sorted by their field id. Scan all
// the mappings to find the ones for the given field id.
while (field_map->field_id < field_id) {
field_map++;
if (field_map == field_map_end) return ts_node__null();
}
while (field_map_end[-1].field_id > field_id) {
field_map_end--;
if (field_map == field_map_end) return ts_node__null();
}
// [field_map, field_map_end) now holds only entries for `field_id`.
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&self);
while (ts_node_child_iterator_next(&iterator, &child)) {
// Extra children are skipped; field map entries are matched against the
// structural (non-extra) child index.
if (!ts_subtree_extra(ts_node__subtree(child))) {
// The iterator has already moved past this child, hence the "- 1".
uint32_t index = iterator.structural_child_index - 1;
if (index < field_map->child_index) continue;
// Hidden nodes' fields are "inherited" by their visible parent.
if (field_map->inherited) {
// If this is the *last* possible child node for this field,
// then perform a tail call to avoid recursion.
if (field_map + 1 == field_map_end) {
self = child;
goto recur;
}
// Otherwise, descend into this child, but if it doesn't contain
// the field, continue searching subsequent children.
else {
TSNode result = ts_node_child_by_field_id(child, field_id);
if (result.id) return result;
field_map++;
if (field_map == field_map_end) return ts_node__null();
}
}
else if (ts_node__is_relevant(child, true)) {
return child;
}
// If the field refers to a hidden node, return its first visible
// child.
else {
return ts_node_child(child, 0);
}
}
}
return ts_node__null();
}
// Look up the field id for `name` (of length `name_length`) in the node's
// language and return the child for that field. The id returned by the
// lookup is passed on unchecked; ts_node_child_by_field_id returns the null
// node for a zero id.
TSNode ts_node_child_by_field_name(
TSNode self,
const char *name,
uint32_t name_length
) {
TSFieldId field_id = ts_language_field_id_for_name(
self.tree->language,
name,
name_length
);
return ts_node_child_by_field_id(self, field_id);
}
// Number of visible children recorded on the node's subtree; 0 for a
// subtree without children.
uint32_t ts_node_child_count(TSNode self) {
  Subtree subtree = ts_node__subtree(self);
  if (ts_subtree_child_count(subtree) == 0) return 0;
  return subtree.ptr->visible_child_count;
}
// Number of named children recorded on the node's subtree; 0 for a subtree
// without children.
uint32_t ts_node_named_child_count(TSNode self) {
  Subtree subtree = ts_node__subtree(self);
  if (ts_subtree_child_count(subtree) == 0) return 0;
  return subtree.ptr->named_child_count;
}
// Public wrappers around the private helpers. In each pair the first variant
// includes anonymous nodes (`include_anonymous = true`) and the "named"
// variant considers named nodes only (`include_anonymous = false`).

// Next / previous sibling.
TSNode ts_node_next_sibling(TSNode self) {
return ts_node__next_sibling(self, true);
}
TSNode ts_node_next_named_sibling(TSNode self) {
return ts_node__next_sibling(self, false);
}
TSNode ts_node_prev_sibling(TSNode self) {
return ts_node__prev_sibling(self, true);
}
TSNode ts_node_prev_named_sibling(TSNode self) {
return ts_node__prev_sibling(self, false);
}
// First child whose end byte is greater than `byte`.
TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) {
return ts_node__first_child_for_byte(self, byte, true);
}
TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) {
return ts_node__first_child_for_byte(self, byte, false);
}
// Deepest descendant spanning a byte range.
TSNode ts_node_descendant_for_byte_range(
TSNode self,
uint32_t start,
uint32_t end
) {
return ts_node__descendant_for_byte_range(self, start, end, true);
}
TSNode ts_node_named_descendant_for_byte_range(
TSNode self,
uint32_t start,
uint32_t end
) {
return ts_node__descendant_for_byte_range(self, start, end, false);
}
// Deepest descendant spanning a row/column range.
TSNode ts_node_descendant_for_point_range(
TSNode self,
TSPoint start,
TSPoint end
) {
return ts_node__descendant_for_point_range(self, start, end, true);
}
TSNode ts_node_named_descendant_for_point_range(
TSNode self,
TSPoint start,
TSPoint end
) {
return ts_node__descendant_for_point_range(self, start, end, false);
}
// Update the start position stored in `*self` to account for `edit`.
//  - A node starting at or after the old end of the edit is shifted so that
//    it keeps its distance from the end of the edited region.
//  - A node starting strictly inside the edited region is moved to the new
//    end of the edit.
//  - A node starting at or before the start of the edit is left where it is.
// Only the three position words of the node's context are rewritten; the
// alias word is untouched.
void ts_node_edit(TSNode *self, const TSInputEdit *edit) {
uint32_t start_byte = ts_node_start_byte(*self);
TSPoint start_point = ts_node_start_point(*self);
if (start_byte >= edit->old_end_byte) {
start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte);
start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point));
} else if (start_byte > edit->start_byte) {
start_byte = edit->new_end_byte;
start_point = edit->new_end_point;
}
self->context[0] = start_byte;
self->context[1] = start_point.row;
self->context[2] = start_point.column;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,54 @@
#ifndef TREE_SITTER_POINT_H_
#define TREE_SITTER_POINT_H_
#include "tree_sitter/api.h"
#define POINT_ZERO ((TSPoint) {0, 0})
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
// Build a TSPoint from a row and a column.
static inline TSPoint point__new(unsigned row, unsigned column) {
TSPoint result = {row, column};
return result;
}
// Advance point `a` by the extent `b`. If `b` spans at least one row, the
// resulting column is `b`'s column (columns restart on a new row); otherwise
// the columns are added.
static inline TSPoint point_add(TSPoint a, TSPoint b) {
  bool crosses_rows = b.row > 0;
  unsigned column = crosses_rows ? b.column : a.column + b.column;
  return point__new(a.row + b.row, column);
}
// Extent from point `b` to point `a`. When `a` is on a later row, the result
// is the row difference together with `a`'s column; otherwise the row
// difference is 0 and the columns are subtracted.
static inline TSPoint point_sub(TSPoint a, TSPoint b) {
  if (a.row <= b.row) {
    return point__new(0, a.column - b.column);
  }
  return point__new(a.row - b.row, a.column);
}
// True if `a` is at or before `b`, comparing rows first and then columns.
static inline bool point_lte(TSPoint a, TSPoint b) {
  if (a.row != b.row) return a.row < b.row;
  return a.column <= b.column;
}
// True if `a` is strictly before `b`, comparing rows first and then columns.
static inline bool point_lt(TSPoint a, TSPoint b) {
  if (a.row != b.row) return a.row < b.row;
  return a.column < b.column;
}
// True if both points have the same row and the same column.
static inline bool point_eq(TSPoint a, TSPoint b) {
return a.row == b.row && a.column == b.column;
}
// The earlier of two points; `b` is returned when they are equal.
static inline TSPoint point_min(TSPoint a, TSPoint b) {
  return point_lt(a, b) ? a : b;
}
// The later of two points; `b` is returned when they are equal.
static inline TSPoint point_max(TSPoint a, TSPoint b) {
  return point_lt(b, a) ? a : b;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
#ifndef TREE_SITTER_REDUCE_ACTION_H_
#define TREE_SITTER_REDUCE_ACTION_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./array.h"
#include "tree_sitter/api.h"
// Description of one reduce action.
typedef struct {
// NOTE(review): presumably the number of child nodes the reduction
// consumes — confirm against the parser.
uint32_t count;
// Symbol associated with the reduction.
TSSymbol symbol;
int dynamic_precedence;
unsigned short production_id;
} ReduceAction;
// Growable array of reduce actions, used as a set (see
// ts_reduce_action_set_add).
typedef Array(ReduceAction) ReduceActionSet;
// Append `new_action` to the set unless it already holds an action with the
// same symbol and the same count. Only those two fields take part in the
// comparison; dynamic_precedence and production_id are ignored.
static inline void ts_reduce_action_set_add(ReduceActionSet *self,
                                            ReduceAction new_action) {
  uint32_t index = 0;
  while (index < self->size) {
    const ReduceAction *existing = &self->contents[index];
    bool same_reduction =
      existing->symbol == new_action.symbol &&
      existing->count == new_action.count;
    if (same_reduction) return;
    index++;
  }
  array_push(self, new_action);
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_REDUCE_ACTION_H_

View File

@ -0,0 +1,88 @@
#include "./subtree.h"
// One level of the path from the root of a tree down to the current subtree.
typedef struct {
// Subtree at this level.
Subtree tree;
// Index of `tree` within its parent's children.
uint32_t child_index;
// Byte offset at which `tree` (including its padding) begins.
uint32_t byte_offset;
} StackEntry;
// Cursor that walks the subtrees of a tree in order. The last entry of
// `stack` is the current subtree.
typedef struct {
Array(StackEntry) stack;
// Last external token of the subtrees the cursor has advanced past.
Subtree last_external_token;
} ReusableNode;
// An empty cursor: no stack entries and no last external token.
static inline ReusableNode reusable_node_new(void) {
return (ReusableNode) {array_new(), NULL_SUBTREE};
}
// Empty the cursor's stack via array_clear and forget the last external
// token.
static inline void reusable_node_clear(ReusableNode *self) {
array_clear(&self->stack);
self->last_external_token = NULL_SUBTREE;
}
// Point the cursor at the root `tree`, at byte offset 0.
static inline void reusable_node_reset(ReusableNode *self, Subtree tree) {
reusable_node_clear(self);
array_push(&self->stack, ((StackEntry) {
.tree = tree,
.child_index = 0,
.byte_offset = 0,
}));
}
// The subtree the cursor currently points at, or NULL_SUBTREE when the
// cursor's stack is empty.
static inline Subtree reusable_node_tree(ReusableNode *self) {
  if (self->stack.size == 0) return NULL_SUBTREE;
  return self->stack.contents[self->stack.size - 1].tree;
}
// Byte offset of the subtree the cursor currently points at, or UINT32_MAX
// when the cursor's stack is empty.
static inline uint32_t reusable_node_byte_offset(ReusableNode *self) {
  if (self->stack.size == 0) return UINT32_MAX;
  return self->stack.contents[self->stack.size - 1].byte_offset;
}
// Release the storage of the cursor's stack via array_delete.
static inline void reusable_node_delete(ReusableNode *self) {
array_delete(&self->stack);
}
// Move the cursor past the current subtree to the next subtree in document
// order: the next sibling if there is one, otherwise the next sibling of the
// nearest ancestor that has one. If there is no such subtree, the stack ends
// up empty.
// The stack must not be empty when this is called (the top entry is read
// unconditionally).
static inline void reusable_node_advance(ReusableNode *self) {
StackEntry last_entry = *array_back(&self->stack);
// The next subtree begins where the current one (padding included) ends.
uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree);
if (ts_subtree_has_external_tokens(last_entry.tree)) {
self->last_external_token = ts_subtree_last_external_token(last_entry.tree);
}
Subtree tree;
uint32_t next_index;
// Pop levels until a parent is found that has a child after the one just
// left.
do {
StackEntry popped_entry = array_pop(&self->stack);
next_index = popped_entry.child_index + 1;
if (self->stack.size == 0) return;
tree = array_back(&self->stack)->tree;
} while (ts_subtree_child_count(tree) <= next_index);
array_push(&self->stack, ((StackEntry) {
.tree = tree.ptr->children[next_index],
.child_index = next_index,
.byte_offset = byte_offset,
}));
}
// Step into the first child of the current subtree. Returns false and
// leaves the cursor alone when the current subtree has no children.
// The first child starts at the same byte offset as its parent.
static inline bool reusable_node_descend(ReusableNode *self) {
  StackEntry top = *array_back(&self->stack);
  if (ts_subtree_child_count(top.tree) == 0) return false;

  StackEntry first_child = {
    .tree = top.tree.ptr->children[0],
    .child_index = 0,
    .byte_offset = top.byte_offset,
  };
  array_push(&self->stack, first_child);
  return true;
}
// Descend along first children down to a subtree without children, then
// advance past that subtree.
static inline void reusable_node_advance_past_leaf(ReusableNode *self) {
while (reusable_node_descend(self)) {}
reusable_node_advance(self);
}

View File

@ -0,0 +1,846 @@
#include "./alloc.h"
#include "./language.h"
#include "./subtree.h"
#include "./array.h"
#include "./stack.h"
#include "./length.h"
#include <assert.h>
#include <stdio.h>
// Capacity of the `links` array of a StackNode.
#define MAX_LINK_COUNT 8
// Maximum number of released StackNodes kept in the pool for reuse.
#define MAX_NODE_POOL_SIZE 50
// NOTE(review): presumably a cap on simultaneously active stack iterators;
// it is used outside the code shown here — confirm.
#define MAX_ITERATOR_COUNT 64
// From here on in this file, `inline` expands to a force-inline specifier.
#if defined _WIN32 && !defined __GNUC__
#define inline __forceinline
#else
#define inline static inline __attribute__((always_inline))
#endif
typedef struct StackNode StackNode;
// Edge from a stack node to one of its predecessors, labelled with the
// subtree that was pushed to get from the predecessor to the node.
typedef struct {
StackNode *node;
Subtree subtree;
bool is_pending;
} StackLink;
// Reference-counted node of the parse stack graph. A node can have up to
// MAX_LINK_COUNT predecessor links.
struct StackNode {
TSStateId state;
// Position reached after all subtrees on the path to this node.
Length position;
StackLink links[MAX_LINK_COUNT];
short unsigned int link_count;
uint32_t ref_count;
// Values accumulated along the path leading to this node.
unsigned error_cost;
unsigned node_count;
int dynamic_precedence;
};
// State of one traversal path while iterating over the stack.
typedef struct {
StackNode *node;
SubtreeArray subtrees;
uint32_t subtree_count;
bool is_pending;
} StackIterator;
// Pairs a user callback with its payload.
typedef struct {
void *payload;
StackIterateCallback callback;
} StackIterateSession;
typedef Array(StackNode *) StackNodeArray;
// Status of a stack head (version).
typedef enum {
StackStatusActive,
StackStatusPaused,
StackStatusHalted,
} StackStatus;
// One version of the stack: the top node plus per-version bookkeeping.
typedef struct {
StackNode *node;
Subtree last_external_token;
// Heap-allocated when present; released in stack_head_delete.
StackSummary *summary;
unsigned node_count_at_last_error;
TSSymbol lookahead_when_paused;
StackStatus status;
} StackHead;
// The stack itself: its heads (versions), scratch arrays, and a pool of
// StackNodes kept for reuse.
struct Stack {
Array(StackHead) heads;
StackSliceArray slices;
Array(StackIterator) iterators;
StackNodeArray node_pool;
StackNode *base_node;
SubtreePool *subtree_pool;
};
// Value returned by a StackCallback. Stop and Pop are distinct bits, so a
// callback can return both at once.
typedef unsigned StackAction;
enum {
StackActionNone,
StackActionStop = 1,
StackActionPop = 2,
};
// Callback receiving a payload pointer and the current iterator.
typedef StackAction (*StackCallback)(void *, const StackIterator *);
// Take an additional reference to `self`. A NULL node is ignored.
static void stack_node_retain(StackNode *self) {
if (!self)
return;
// The node must still be alive ...
assert(self->ref_count > 0);
self->ref_count++;
// ... and the counter must not have wrapped around to zero.
assert(self->ref_count != 0);
}
// Drop one reference to `self`. When the last reference goes away, the
// subtrees on the node's links are released, the node is recycled into
// `pool` (or freed once the pool holds MAX_NODE_POOL_SIZE nodes), and the
// references the node held on its predecessors are dropped as well.
static void stack_node_release(StackNode *self, StackNodeArray *pool, SubtreePool *subtree_pool) {
recur:
assert(self->ref_count != 0);
self->ref_count--;
if (self->ref_count > 0) return;
StackNode *first_predecessor = NULL;
if (self->link_count > 0) {
// Links 1..n-1 are released recursively, from last to first.
for (unsigned i = self->link_count - 1; i > 0; i--) {
StackLink link = self->links[i];
if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree);
stack_node_release(link.node, pool, subtree_pool);
}
// Link 0 is released below by jumping back to `recur` instead of by a
// recursive call.
StackLink link = self->links[0];
if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree);
first_predecessor = self->links[0].node;
}
if (pool->size < MAX_NODE_POOL_SIZE) {
array_push(pool, self);
} else {
ts_free(self);
}
if (first_predecessor) {
self = first_predecessor;
goto recur;
}
}
// Create a stack node in `state` with one reference. The node is taken from
// `pool` when one is available, otherwise allocated with ts_malloc.
//
// With a `previous_node`, the new node gets a single link to it labelled
// with `subtree`, and inherits the predecessor's position, error cost, node
// count and dynamic precedence, each extended by the subtree's contribution
// when `subtree` is non-null. Without one, it starts at position zero with
// no links.
//
// No reference is taken here on `previous_node` or `subtree`.
static StackNode *stack_node_new(StackNode *previous_node, Subtree subtree,
bool is_pending, TSStateId state, StackNodeArray *pool) {
StackNode *node = pool->size > 0 ?
array_pop(pool) :
ts_malloc(sizeof(StackNode));
// Fields not named here are zero-initialized by the compound literal.
*node = (StackNode){.ref_count = 1, .link_count = 0, .state = state};
if (previous_node) {
node->link_count = 1;
node->links[0] = (StackLink){
.node = previous_node,
.subtree = subtree,
.is_pending = is_pending,
};
node->position = previous_node->position;
node->error_cost = previous_node->error_cost;
node->dynamic_precedence = previous_node->dynamic_precedence;
node->node_count = previous_node->node_count;
if (subtree.ptr) {
node->error_cost += ts_subtree_error_cost(subtree);
node->position = length_add(node->position, ts_subtree_total_size(subtree));
node->node_count += ts_subtree_node_count(subtree);
node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree);
}
} else {
node->position = length_zero();
node->error_cost = 0;
}
return node;
}
// Decide whether two link subtrees can be treated as interchangeable.
// They are equivalent if they are the same subtree (including both null),
// or if both are non-null with the same symbol and either both carry an
// error cost, or they agree on padding bytes, size bytes, child count,
// extra flag and external scanner state.
static bool stack__subtree_is_equivalent(Subtree left, Subtree right) {
  if (left.ptr == right.ptr) return true;
  if (!left.ptr || !right.ptr) return false;
  if (ts_subtree_symbol(left) != ts_subtree_symbol(right)) return false;

  // Two erroneous subtrees with the same symbol are considered equivalent
  // without looking any further.
  if (ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) {
    return true;
  }

  return
    ts_subtree_padding(left).bytes == ts_subtree_padding(right).bytes &&
    ts_subtree_size(left).bytes == ts_subtree_size(right).bytes &&
    ts_subtree_child_count(left) == ts_subtree_child_count(right) &&
    ts_subtree_extra(left) == ts_subtree_extra(right) &&
    ts_subtree_external_scanner_state_eq(left, right);
}
// Add `link` as a further predecessor link of `self`, merging it with an
// existing equivalent link where possible. A link from a node to itself is
// ignored, and a link that can neither be merged nor stored (the node
// already has MAX_LINK_COUNT links) is dropped silently.
static void stack_node_add_link(StackNode *self, StackLink link, SubtreePool *subtree_pool) {
if (link.node == self) return;
for (int i = 0; i < self->link_count; i++) {
StackLink *existing_link = &self->links[i];
if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) {
// In general, we preserve ambiguities until they are removed from the stack
// during a pop operation where multiple paths lead to the same node. But in
// the special case where two links directly connect the same pair of nodes,
// we can safely remove the ambiguity ahead of time without changing behavior.
if (existing_link->node == link.node) {
// Keep whichever subtree has the higher dynamic precedence.
if (
ts_subtree_dynamic_precedence(link.subtree) >
ts_subtree_dynamic_precedence(existing_link->subtree)
) {
ts_subtree_retain(link.subtree);
ts_subtree_release(subtree_pool, existing_link->subtree);
existing_link->subtree = link.subtree;
self->dynamic_precedence =
link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree);
}
return;
}
// If the previous nodes are mergeable, merge them recursively.
if (existing_link->node->state == link.node->state &&
existing_link->node->position.bytes == link.node->position.bytes) {
for (int j = 0; j < link.node->link_count; j++) {
stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool);
}
int32_t dynamic_precedence = link.node->dynamic_precedence;
if (link.subtree.ptr) {
dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree);
}
if (dynamic_precedence > self->dynamic_precedence) {
self->dynamic_precedence = dynamic_precedence;
}
return;
}
}
}
// No merge was possible: store the link, taking references on its node and
// subtree, and raise this node's node count and dynamic precedence if the
// new path yields higher values.
if (self->link_count == MAX_LINK_COUNT) return;
stack_node_retain(link.node);
unsigned node_count = link.node->node_count;
int dynamic_precedence = link.node->dynamic_precedence;
self->links[self->link_count++] = link;
if (link.subtree.ptr) {
ts_subtree_retain(link.subtree);
node_count += ts_subtree_node_count(link.subtree);
dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree);
}
if (node_count > self->node_count) self->node_count = node_count;
if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence;
}
// Release everything a stack head owns: its last external token, its recorded
// summary (if any), and its reference to the top stack node. A head whose
// `node` is NULL is left untouched.
static void stack_head_delete(StackHead *self, StackNodeArray *pool, SubtreePool *subtree_pool) {
  if (!self->node) return;

  if (self->last_external_token.ptr) {
    ts_subtree_release(subtree_pool, self->last_external_token);
  }

  if (self->summary) {
    // The summary is a heap-allocated array: free its contents, then the
    // array header itself.
    array_delete(self->summary);
    ts_free(self->summary);
  }

  stack_node_release(self->node, pool, subtree_pool);
}
// Append a new, active stack version whose top node is `node`. The error
// bookkeeping and last external token are inherited from `original_version`.
// The new head takes its own references to the node and to the token.
// Returns the index of the newly added version.
static StackVersion ts_stack__add_version(Stack *self, StackVersion original_version,
                                          StackNode *node) {
  StackHead new_head = {
    .node = node,
    .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error,
    .last_external_token = self->heads.contents[original_version].last_external_token,
    .status = StackStatusActive,
    .lookahead_when_paused = 0,
  };

  array_push(&self->heads, new_head);

  stack_node_retain(node);
  if (new_head.last_external_token.ptr) {
    ts_subtree_retain(new_head.last_external_token);
  }

  StackVersion new_version = (StackVersion)(self->heads.size - 1);
  return new_version;
}
// Record a popped slice (`subtrees`) that ends at `node`.
//
// If a previously recorded slice already belongs to a version whose head is
// `node`, the new slice is inserted right after it and shares that version.
// Otherwise a fresh version is created for `node` and the slice is appended.
static void ts_stack__add_slice(Stack *self, StackVersion original_version,
                                StackNode *node, SubtreeArray *subtrees) {
  // Scan existing slices from last to first; `end` is one past the slice
  // being inspected, which is also the insertion index on a match.
  for (uint32_t end = self->slices.size; end > 0; end--) {
    StackVersion existing_version = self->slices.contents[end - 1].version;
    if (self->heads.contents[existing_version].node != node) continue;

    StackSlice slice = {*subtrees, existing_version};
    array_insert(&self->slices, end, slice);
    return;
  }

  StackVersion new_version = ts_stack__add_version(self, original_version, node);
  StackSlice slice = {*subtrees, new_version};
  array_push(&self->slices, slice);
}
// Walk the stack graph backwards from the head of `version`, invoking
// `callback` on every iterator position, and return the slices produced.
//
// - `callback` returns a bit set of StackAction flags. StackActionPop records
//   the subtrees collected so far as a slice; StackActionStop ends that path.
//   A path also ends when its node has no links.
// - When `goal_subtree_count` is non-negative, the subtrees along each path
//   are collected (and retained); when negative, only counts are tracked.
//
// The returned array is `self->slices`, which is cleared on entry, so its
// contents are only valid until the next call.
inline StackSliceArray stack__iter(Stack *self, StackVersion version,
StackCallback callback, void *payload,
int goal_subtree_count) {
array_clear(&self->slices);
array_clear(&self->iterators);
StackHead *head = array_get(&self->heads, version);
StackIterator iterator = {
.node = head->node,
.subtrees = array_new(),
.subtree_count = 0,
.is_pending = true,
};
bool include_subtrees = false;
if (goal_subtree_count >= 0) {
include_subtrees = true;
array_reserve(&iterator.subtrees, goal_subtree_count);
}
array_push(&self->iterators, iterator);
while (self->iterators.size > 0) {
// `size` is captured up front so iterators forked during this pass are
// first visited on the next pass of the outer loop.
for (uint32_t i = 0, size = self->iterators.size; i < size; i++) {
StackIterator *iterator = &self->iterators.contents[i];
StackNode *node = iterator->node;
StackAction action = callback(payload, iterator);
bool should_pop = action & StackActionPop;
bool should_stop = action & StackActionStop || node->link_count == 0;
if (should_pop) {
SubtreeArray subtrees = iterator->subtrees;
// If the iterator keeps going it still needs its own array, so the slice
// gets a copy; if it stops, the slice takes over the array directly.
if (!should_stop)
ts_subtree_array_copy(subtrees, &subtrees);
// Subtrees were collected top-of-stack first; slices store them reversed.
ts_subtree_array_reverse(&subtrees);
ts_stack__add_slice(
self,
version,
node,
&subtrees
);
}
if (should_stop) {
// When not popped, nothing else owns the collected subtrees: release them.
if (!should_pop)
ts_subtree_array_delete(self->subtree_pool, &iterator->subtrees);
array_erase(&self->iterators, i);
i--, size--;
continue;
}
// Follow every link of the node. Links 1..n-1 each fork a copy of the
// current iterator; link 0 is handled last (j == link_count) and reuses
// the current iterator in place, after the copies have been taken.
for (uint32_t j = 1; j <= node->link_count; j++) {
StackIterator *next_iterator;
StackLink link;
if (j == node->link_count) {
link = node->links[0];
next_iterator = &self->iterators.contents[i];
} else {
// Cap the number of live iterators; extra links are not explored.
if (self->iterators.size >= MAX_ITERATOR_COUNT) continue;
link = node->links[j];
StackIterator current_iterator = self->iterators.contents[i];
array_push(&self->iterators, current_iterator);
next_iterator = array_back(&self->iterators);
ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees);
}
next_iterator->node = link.node;
if (link.subtree.ptr) {
if (include_subtrees) {
array_push(&next_iterator->subtrees, link.subtree);
ts_subtree_retain(link.subtree);
}
// Extra subtrees are collected but do not count towards the depth.
if (!ts_subtree_extra(link.subtree)) {
next_iterator->subtree_count++;
if (!link.is_pending) {
next_iterator->is_pending = false;
}
}
} else {
// A link without a subtree still counts as one step and is never pending.
next_iterator->subtree_count++;
next_iterator->is_pending = false;
}
}
}
}
return self->slices;
}
// Allocate and initialize a stack that borrows `subtree_pool` for subtree
// releases. The stack starts with a single version pointing at the base node.
Stack *ts_stack_new(SubtreePool *subtree_pool) {
  Stack *stack = ts_calloc(1, sizeof(Stack));

  array_init(&stack->heads);
  array_init(&stack->slices);
  array_init(&stack->iterators);
  array_init(&stack->node_pool);

  // Pre-size the working arrays.
  array_reserve(&stack->heads, 4);
  array_reserve(&stack->slices, 4);
  array_reserve(&stack->iterators, 4);
  array_reserve(&stack->node_pool, MAX_NODE_POOL_SIZE);

  stack->subtree_pool = subtree_pool;

  // The base node has no predecessor and no subtree, and is created in state 1.
  stack->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &stack->node_pool);

  // Installs the single initial head on top of the base node.
  ts_stack_clear(stack);
  return stack;
}
// Destroy a stack: free the scratch arrays, drop the base node and every
// head, free all pooled nodes, and finally free the stack itself.
void ts_stack_delete(Stack *self) {
  if (self->slices.contents) {
    array_delete(&self->slices);
  }
  if (self->iterators.contents) {
    array_delete(&self->iterators);
  }

  stack_node_release(self->base_node, &self->node_pool, self->subtree_pool);

  for (uint32_t head_index = 0; head_index < self->heads.size; head_index++) {
    StackHead *head = &self->heads.contents[head_index];
    stack_head_delete(head, &self->node_pool, self->subtree_pool);
  }
  array_clear(&self->heads);

  // Nodes released above may have been parked in the node pool; free them.
  if (self->node_pool.contents) {
    for (uint32_t pooled = 0; pooled < self->node_pool.size; pooled++) {
      ts_free(self->node_pool.contents[pooled]);
    }
    array_delete(&self->node_pool);
  }

  array_delete(&self->heads);
  ts_free(self);
}
// Number of versions (heads) currently held by the stack.
uint32_t ts_stack_version_count(const Stack *self) {
  uint32_t version_count = self->heads.size;
  return version_count;
}
// Parse state stored on the top node of the given version.
TSStateId ts_stack_state(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->node->state;
}
// Position stored on the top node of the given version.
Length ts_stack_position(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->node->position;
}
// Last external token recorded on the given version. The returned subtree is
// not retained on behalf of the caller.
Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->last_external_token;
}
// Replace the last external token of the given version. The new token is
// retained before the old one is released, so passing the token that is
// already stored is safe.
void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) {
  StackHead *head = array_get(&self->heads, version);

  if (token.ptr) {
    ts_subtree_retain(token);
  }

  Subtree previous = head->last_external_token;
  if (previous.ptr) {
    ts_subtree_release(self->subtree_pool, previous);
  }

  head->last_external_token = token;
}
// Error cost of the given version: the cost stored on its top node, plus one
// ERROR_COST_PER_RECOVERY when the version is paused, or when it sits in the
// error state and its first link carries no subtree.
unsigned ts_stack_error_cost(const Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  unsigned cost = head->node->error_cost;

  if (head->status == StackStatusPaused) {
    return cost + ERROR_COST_PER_RECOVERY;
  }
  if (head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr) {
    return cost + ERROR_COST_PER_RECOVERY;
  }
  return cost;
}
// Number of tree nodes added on this version since its last recorded error.
// If the top node's count has dropped below the recorded baseline, the
// baseline is lowered to match, so the result is never negative.
unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  unsigned current_count = head->node->node_count;

  if (current_count < head->node_count_at_last_error) {
    head->node_count_at_last_error = current_count;
  }

  return current_count - head->node_count_at_last_error;
}
// Push `subtree` and `state` onto the given version by creating a new node on
// top of the current one. Pushing a NULL subtree resets the version's
// "node count at last error" baseline to the new node's count.
void ts_stack_push(Stack *self, StackVersion version, Subtree subtree,
                   bool pending, TSStateId state) {
  StackHead *head = array_get(&self->heads, version);
  StackNode *pushed = stack_node_new(head->node, subtree, pending, state, &self->node_pool);

  if (!subtree.ptr) {
    head->node_count_at_last_error = pushed->node_count;
  }

  head->node = pushed;
}
inline StackAction iterate_callback(void *payload, const StackIterator *iterator) {
StackIterateSession *session = payload;
session->callback(
session->payload,
iterator->node->state,
iterator->subtree_count
);
return StackActionNone;
}
// Visit every node reachable from the given version, calling `callback` with
// `payload`, the node's parse state, and its depth. No subtrees are collected.
void ts_stack_iterate(Stack *self, StackVersion version,
                      StackIterateCallback callback, void *payload) {
  StackIterateSession session = {
    .payload = payload,
    .callback = callback,
  };
  stack__iter(self, version, iterate_callback, &session, -1);
}
// stack__iter callback: pop and stop as soon as the iterator has passed
// exactly the requested number of subtrees (`payload` points at that count).
inline StackAction pop_count_callback(void *payload, const StackIterator *iterator) {
  unsigned *goal_subtree_count = payload;
  bool reached_goal = iterator->subtree_count == *goal_subtree_count;
  return reached_goal ? (StackActionPop | StackActionStop) : StackActionNone;
}
// Pop `count` entries from the given version. Returns one slice per distinct
// path that was popped.
StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) {
  StackSliceArray slices = stack__iter(self, version, pop_count_callback, &count, count);
  return slices;
}
// stack__iter callback: once one subtree has been passed, stop; pop it only
// if every link traversed so far was pending.
inline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) {
  if (iterator->subtree_count < 1) return StackActionNone;
  if (!iterator->is_pending) return StackActionStop;
  return StackActionPop | StackActionStop;
}
// Pop a pending subtree from the top of the given version, if there is one.
// When something was popped, the first resulting version is renumbered so
// that it replaces `version` in place.
StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) {
  StackSliceArray slices = stack__iter(self, version, pop_pending_callback, NULL, 0);
  if (slices.size == 0) return slices;

  ts_stack_renumber_version(self, slices.contents[0].version, version);
  slices.contents[0].version = version;
  return slices;
}
// stack__iter callback: stop after the first collected subtree, popping it
// only if it is an error and no error has been popped yet (`payload` points
// at the shared "found" flag).
inline StackAction pop_error_callback(void *payload, const StackIterator *iterator) {
  if (iterator->subtrees.size == 0) return StackActionNone;

  bool *found_error = payload;
  if (*found_error || !ts_subtree_is_error(iterator->subtrees.contents[0])) {
    return StackActionStop;
  }

  *found_error = true;
  return StackActionPop | StackActionStop;
}
// If any link of the top node of `version` carries an error subtree, pop it
// and return the popped subtrees; the resulting version replaces `version`.
// Returns an empty array when there is no error on top or nothing was popped.
SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) {
  StackNode *top = array_get(&self->heads, version)->node;

  for (unsigned link_index = 0; link_index < top->link_count; link_index++) {
    Subtree candidate = top->links[link_index].subtree;
    if (!candidate.ptr || !ts_subtree_is_error(candidate)) continue;

    // Only the first error link triggers a pop attempt.
    bool found_error = false;
    StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1);
    if (pop.size == 0) break;

    assert(pop.size == 1);
    ts_stack_renumber_version(self, pop.contents[0].version, version);
    return pop.contents[0].subtrees;
  }

  return (SubtreeArray){.size = 0};
}
// stack__iter callback: keep walking until a node with no links is reached,
// then pop everything collected on the way.
inline StackAction pop_all_callback(void *payload, const StackIterator *iterator) {
  if (iterator->node->link_count == 0) {
    return StackActionPop;
  }
  return StackActionNone;
}
// Pop every subtree from the given version, down to a node with no links.
StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) {
  StackSliceArray slices = stack__iter(self, version, pop_all_callback, NULL, 0);
  return slices;
}
// Payload handed to summarize_stack_callback while recording a stack summary.
typedef struct {
// Destination array that receives one entry per distinct (depth, state) pair.
StackSummary *summary;
// Paths deeper than this many subtrees are not followed.
unsigned max_depth;
} SummarizeStackSession;
inline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) {
SummarizeStackSession *session = payload;
TSStateId state = iterator->node->state;
unsigned depth = iterator->subtree_count;
if (depth > session->max_depth) return StackActionStop;
for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) {
StackSummaryEntry entry = session->summary->contents[i];
if (entry.depth < depth) break;
if (entry.depth == depth && entry.state == state) return StackActionNone;
}
array_push(session->summary, ((StackSummaryEntry){
.position = iterator->node->position,
.depth = depth,
.state = state,
}));
return StackActionNone;
}
// Compute a summary of the parse states within `max_depth` subtrees of the
// top of the given version and store it on that version's head, where
// ts_stack_get_summary can retrieve it.
//
// Any summary previously recorded on the head is freed first; without this,
// recording a summary twice on the same version would leak the earlier one.
void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) {
  SummarizeStackSession session = {
    .summary = ts_malloc(sizeof(StackSummary)),
    .max_depth = max_depth
  };
  array_init(session.summary);
  stack__iter(self, version, summarize_stack_callback, &session, -1);

  // Look the head up only after iterating, so the pointer is taken from the
  // heads array in its final state.
  StackHead *head = &self->heads.contents[version];
  if (head->summary) {
    array_delete(head->summary);
    ts_free(head->summary);
  }
  head->summary = session.summary;
}
// Summary previously recorded on the given version, or whatever the head's
// summary pointer currently holds (NULL if none was recorded).
StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  return head->summary;
}
// Dynamic precedence stored on the top node of the given version.
int ts_stack_dynamic_precedence(Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  return head->node->dynamic_precedence;
}
// Report whether the given version has consumed any input since its last
// error. A version with zero error cost counts as having advanced. Otherwise
// the first link of each node is followed downwards: a subtree with non-zero
// total bytes means "advanced"; an empty, error-free subtree above the error
// baseline is skipped; anything else ends the search with "not advanced".
bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  const StackNode *node = head->node;
  if (node->error_cost == 0) return true;

  for (;;) {
    if (!node || node->link_count == 0) return false;

    Subtree subtree = node->links[0].subtree;
    if (!subtree.ptr) return false;
    if (ts_subtree_total_bytes(subtree) > 0) return true;

    // Only keep descending through empty subtrees that were pushed after the
    // last error and that carry no error cost themselves.
    if (node->node_count <= head->node_count_at_last_error ||
        ts_subtree_error_cost(subtree) != 0) {
      return false;
    }

    node = node->links[0].node;
  }
}
// Release everything owned by the given version and remove it from the list
// of heads.
void ts_stack_remove_version(Stack *self, StackVersion version) {
  StackHead *doomed = array_get(&self->heads, version);
  stack_head_delete(doomed, &self->node_pool, self->subtree_pool);
  array_erase(&self->heads, version);
}
// Move version `v1` into slot `v2` (which must be lower), discarding what was
// in `v2`. If `v2` had a recorded summary and `v1` did not, the summary is
// carried over to the surviving head. No-op when the indices are equal.
void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) {
  if (v1 == v2) return;
  assert(v2 < v1);
  assert((uint32_t)v1 < self->heads.size);

  StackHead *moved = self->heads.contents + v1;
  StackHead *replaced = self->heads.contents + v2;

  // Hand the summary over before the replaced head is torn down, so it is
  // not freed along with it.
  if (replaced->summary && !moved->summary) {
    moved->summary = replaced->summary;
    replaced->summary = NULL;
  }

  stack_head_delete(replaced, &self->node_pool, self->subtree_pool);
  *replaced = *moved;
  array_erase(&self->heads, v1);
}
// Exchange the heads stored at the two version indices.
void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) {
  StackHead *first = &self->heads.contents[v1];
  StackHead *second = &self->heads.contents[v2];

  StackHead saved = *first;
  *first = *second;
  *second = saved;
}
// Duplicate the given version as a new version at the end of the list. The
// copy takes its own references to the top node and the last external token,
// and starts without a summary. Returns the new version's index.
StackVersion ts_stack_copy_version(Stack *self, StackVersion version) {
  assert(version < self->heads.size);

  StackHead duplicate = self->heads.contents[version];
  duplicate.summary = NULL;
  array_push(&self->heads, duplicate);

  stack_node_retain(duplicate.node);
  if (duplicate.last_external_token.ptr) {
    ts_subtree_retain(duplicate.last_external_token);
  }

  return self->heads.size - 1;
}
// Merge `version2` into `version1` when ts_stack_can_merge allows it: every
// link of version2's top node is added to version1's top node, then version2
// is removed. Returns false (and changes nothing) when they cannot be merged.
bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) {
  if (!ts_stack_can_merge(self, version1, version2)) return false;

  StackHead *kept = &self->heads.contents[version1];
  StackHead *absorbed = &self->heads.contents[version2];

  for (uint32_t link_index = 0; link_index < absorbed->node->link_count; link_index++) {
    stack_node_add_link(kept->node, absorbed->node->links[link_index], self->subtree_pool);
  }

  // In the error state, restart the "nodes since error" count from the
  // merged node.
  if (kept->node->state == ERROR_STATE) {
    kept->node_count_at_last_error = kept->node->node_count;
  }

  ts_stack_remove_version(self, version2);
  return true;
}
// Two versions can be merged when both are active, their top nodes agree on
// parse state, byte position and error cost, and their last external tokens
// have equal external scanner state.
bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) {
  StackHead *head1 = &self->heads.contents[version1];
  StackHead *head2 = &self->heads.contents[version2];

  if (head1->status != StackStatusActive) return false;
  if (head2->status != StackStatusActive) return false;
  if (head1->node->state != head2->node->state) return false;
  if (head1->node->position.bytes != head2->node->position.bytes) return false;
  if (head1->node->error_cost != head2->node->error_cost) return false;

  return ts_subtree_external_scanner_state_eq(
    head1->last_external_token,
    head2->last_external_token
  );
}
// Mark the given version as halted.
void ts_stack_halt(Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  head->status = StackStatusHalted;
}
// Pause the given version, remembering the lookahead symbol it was paused on
// and resetting its "node count at last error" baseline to the current count.
void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) {
  StackHead *paused = array_get(&self->heads, version);
  paused->node_count_at_last_error = paused->node->node_count;
  paused->lookahead_when_paused = lookahead;
  paused->status = StackStatusPaused;
}
// True when the given version's status is StackStatusActive.
bool ts_stack_is_active(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->status == StackStatusActive;
}
// True when the given version's status is StackStatusHalted.
bool ts_stack_is_halted(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->status == StackStatusHalted;
}
// True when the given version's status is StackStatusPaused.
bool ts_stack_is_paused(const Stack *self, StackVersion version) {
  const StackHead *head = array_get(&self->heads, version);
  return head->status == StackStatusPaused;
}
// Reactivate a paused version and return the lookahead symbol that was saved
// when it was paused. The version must currently be paused.
TSSymbol ts_stack_resume(Stack *self, StackVersion version) {
  StackHead *head = array_get(&self->heads, version);
  assert(head->status == StackStatusPaused);

  TSSymbol saved_lookahead = head->lookahead_when_paused;
  head->lookahead_when_paused = 0;
  head->status = StackStatusActive;
  return saved_lookahead;
}
// Reset the stack to a single active version whose top is the base node.
// All existing versions are released first.
void ts_stack_clear(Stack *self) {
  // Take the reference that the fresh head will own before tearing down the
  // old heads.
  stack_node_retain(self->base_node);

  for (uint32_t head_index = 0; head_index < self->heads.size; head_index++) {
    StackHead *old_head = &self->heads.contents[head_index];
    stack_head_delete(old_head, &self->node_pool, self->subtree_pool);
  }
  array_clear(&self->heads);

  StackHead initial_head = {
    .node = self->base_node,
    .last_external_token = NULL_SUBTREE,
    .status = StackStatusActive,
    .lookahead_when_paused = 0,
  };
  array_push(&self->heads, initial_head);
}
// Write a Graphviz "dot" description of the whole stack to `f` (stderr when
// `f` is NULL). Every non-halted version is drawn as a head pointing at its
// top node, and every node reachable from those heads is drawn once, with one
// edge per link. Allocation recording is switched off for the duration of the
// call and restored afterwards. Always returns true.
//
// Format-specifier notes:
//  - external scanner state bytes are converted to `unsigned char` before
//    being printed with %X, so bytes >= 0x80 are not sign-extended to
//    "FFFFFFxx" on platforms where plain `char` is signed;
//  - a subtree's dynamic precedence is a signed quantity and is printed with
//    %d (matching how the node's own dynamic precedence is printed);
//  - pointers passed to %p are cast to `void *` as the C standard requires.
bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) {
  array_reserve(&self->iterators, 32);
  bool was_recording_allocations = ts_toggle_allocation_recording(false);
  if (!f) f = stderr;

  fprintf(f, "digraph stack {\n");
  fprintf(f, "rankdir=\"RL\";\n");
  fprintf(f, "edge [arrowhead=none]\n");

  Array(StackNode *) visited_nodes = array_new();

  // Emit one pseudo-node per live head and seed an iterator at its top node.
  array_clear(&self->iterators);
  for (uint32_t i = 0; i < self->heads.size; i++) {
    StackHead *head = &self->heads.contents[i];
    if (head->status == StackStatusHalted) continue;

    fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i);
    fprintf(f, "node_head_%u -> node_%p [", i, (void *)head->node);

    if (head->status == StackStatusPaused) {
      fprintf(f, "color=red ");
    }
    fprintf(f,
      "label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u",
      i,
      ts_stack_node_count_since_error(self, i),
      ts_stack_error_cost(self, i)
    );

    if (head->last_external_token.ptr) {
      const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state;
      const char *data = ts_external_scanner_state_data(state);
      fprintf(f, "\nexternal_scanner_state:");
      for (uint32_t j = 0; j < state->length; j++) {
        fprintf(f, " %2X", (unsigned)(unsigned char)data[j]);
      }
    }

    fprintf(f, "\"]\n");
    array_push(&self->iterators, ((StackIterator){.node = head->node }));
  }

  // Repeatedly advance every iterator by one node until all of them sit on
  // nodes that have already been printed.
  bool all_iterators_done = false;
  while (!all_iterators_done) {
    all_iterators_done = true;

    for (uint32_t i = 0; i < self->iterators.size; i++) {
      StackIterator iterator = self->iterators.contents[i];
      StackNode *node = iterator.node;

      // Skip nodes that were already printed.
      for (uint32_t j = 0; j < visited_nodes.size; j++) {
        if (visited_nodes.contents[j] == node) {
          node = NULL;
          break;
        }
      }

      if (!node) continue;
      all_iterators_done = false;

      fprintf(f, "node_%p [", (void *)node);
      if (node->state == ERROR_STATE) {
        fprintf(f, "label=\"?\"");
      } else if (
        node->link_count == 1 &&
        node->links[0].subtree.ptr &&
        ts_subtree_extra(node->links[0].subtree)
      ) {
        fprintf(f, "shape=point margin=0 label=\"\"");
      } else {
        fprintf(f, "label=\"%d\"", node->state);
      }

      fprintf(
        f,
        " tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n",
        node->position.extent.row + 1,
        node->position.extent.column,
        node->node_count,
        node->error_cost,
        node->dynamic_precedence
      );

      for (int j = 0; j < node->link_count; j++) {
        StackLink link = node->links[j];
        fprintf(f, "node_%p -> node_%p [", (void *)node, (void *)link.node);
        if (link.is_pending) fprintf(f, "style=dashed ");
        if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray ");

        if (!link.subtree.ptr) {
          fprintf(f, "color=red");
        } else {
          fprintf(f, "label=\"");
          // Anonymous visible tokens are shown in single quotes.
          bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree);
          if (quoted) fprintf(f, "'");
          const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree));
          // Escape characters that would terminate or corrupt the dot string.
          for (const char *c = name; *c; c++) {
            if (*c == '\"' || *c == '\\') fprintf(f, "\\");
            fprintf(f, "%c", *c);
          }
          if (quoted) fprintf(f, "'");
          fprintf(f, "\"");
          fprintf(
            f,
            "labeltooltip=\"error_cost: %u\ndynamic_precedence: %d\"",
            ts_subtree_error_cost(link.subtree),
            (int)ts_subtree_dynamic_precedence(link.subtree)
          );
        }

        fprintf(f, "];\n");

        // The first link reuses the current iterator slot; every further
        // link forks a new iterator.
        StackIterator *next_iterator;
        if (j == 0) {
          next_iterator = &self->iterators.contents[i];
        } else {
          array_push(&self->iterators, iterator);
          next_iterator = array_back(&self->iterators);
        }
        next_iterator->node = link.node;
      }

      array_push(&visited_nodes, node);
    }
  }

  fprintf(f, "}\n");

  array_delete(&visited_nodes);
  ts_toggle_allocation_recording(was_recording_allocations);
  return true;
}
#undef inline

View File

@ -0,0 +1,135 @@
#ifndef TREE_SITTER_PARSE_STACK_H_
#define TREE_SITTER_PARSE_STACK_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./array.h"
#include "./subtree.h"
#include "./error_costs.h"
#include <stdio.h>
typedef struct Stack Stack;
typedef unsigned StackVersion;
#define STACK_VERSION_NONE ((StackVersion)-1)
// One result of a pop operation: the subtrees that were removed, together
// with the stack version they were removed from.
typedef struct {
SubtreeArray subtrees;
StackVersion version;
} StackSlice;
// Array of slices, as returned by the ts_stack_pop_* functions.
typedef Array(StackSlice) StackSliceArray;
// One entry of a stack summary: a parse state found at a given depth (number
// of subtrees below the top of the stack), with the position of its node.
typedef struct {
Length position;
unsigned depth;
TSStateId state;
} StackSummaryEntry;
// Summary recorded by ts_stack_record_summary.
typedef Array(StackSummaryEntry) StackSummary;
// Create a stack.
Stack *ts_stack_new(SubtreePool *);
// Release the memory reserved for a given stack.
void ts_stack_delete(Stack *);
// Get the stack's current number of versions.
uint32_t ts_stack_version_count(const Stack *);
// Get the state at the top of the given version of the stack. If the stack is
// empty, this returns the initial state, 0.
TSStateId ts_stack_state(const Stack *, StackVersion);
// Get the last external token associated with a given version of the stack.
Subtree ts_stack_last_external_token(const Stack *, StackVersion);
// Set the last external token associated with a given version of the stack.
void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree );
// Get the position of the given version of the stack within the document.
Length ts_stack_position(const Stack *, StackVersion);
// Push a tree and state onto the given version of the stack.
//
// This transfers ownership of the tree to the Stack. Callers that
// need to retain ownership of the tree for their own purposes should
// first retain the tree.
void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId);
// Pop the given number of entries from the given version of the stack. This
// operation can increase the number of stack versions by revealing multiple
// versions which had previously been merged. It returns an array that
// specifies the index of each revealed version and the trees that were
// removed from that version.
StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count);
// Remove an error at the top of the given version of the stack.
SubtreeArray ts_stack_pop_error(Stack *, StackVersion);
// Remove any pending trees from the top of the given version of the stack.
StackSliceArray ts_stack_pop_pending(Stack *, StackVersion);
// Remove all trees from the given version of the stack.
StackSliceArray ts_stack_pop_all(Stack *, StackVersion);
// Get the maximum number of tree nodes reachable from this version of the stack
// since the last error was detected.
unsigned ts_stack_node_count_since_error(const Stack *, StackVersion);
// Get the dynamic precedence stored on the top node of the given version.
int ts_stack_dynamic_precedence(Stack *, StackVersion);
// Determine whether the given version has consumed any input since its last
// error. A version with no error cost always counts as having advanced.
bool ts_stack_has_advanced_since_error(const Stack *, StackVersion);
// Compute a summary of all the parse states near the top of the given
// version of the stack and store the summary for later retrieval.
void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth);
// Retrieve a summary of all the parse states near the top of the
// given version of the stack.
StackSummary *ts_stack_get_summary(Stack *, StackVersion);
// Get the total cost of all errors on the given version of the stack.
unsigned ts_stack_error_cost(const Stack *, StackVersion version);
// Merge the given two stack versions if possible, returning true
// if they were successfully merged and false otherwise.
bool ts_stack_merge(Stack *, StackVersion, StackVersion);
// Determine whether the given two stack versions can be merged.
bool ts_stack_can_merge(Stack *, StackVersion, StackVersion);
// Reactivate a paused version and return the lookahead symbol that was
// stored when it was paused. The version must be paused.
TSSymbol ts_stack_resume(Stack *, StackVersion);
// Pause the given version, remembering the given lookahead symbol.
void ts_stack_pause(Stack *, StackVersion, TSSymbol);
// Mark the given version as halted.
void ts_stack_halt(Stack *, StackVersion);
// Query the status (active / paused / halted) of the given version.
bool ts_stack_is_active(const Stack *, StackVersion);
bool ts_stack_is_paused(const Stack *, StackVersion);
bool ts_stack_is_halted(const Stack *, StackVersion);
// Move the first version into the slot of the second (lower-numbered)
// version, discarding the version that previously occupied that slot.
void ts_stack_renumber_version(Stack *, StackVersion, StackVersion);
// Exchange the positions of the given two versions.
void ts_stack_swap_versions(Stack *, StackVersion, StackVersion);
// Append a copy of the given version and return the copy's index.
StackVersion ts_stack_copy_version(Stack *, StackVersion);
// Remove the given version from the stack.
void ts_stack_remove_version(Stack *, StackVersion);
// Remove all versions and reset the stack to a single active version that
// points at the base node.
void ts_stack_clear(Stack *);
// Write a Graphviz "dot" rendering of the stack to the given file (stderr if
// the file is NULL).
bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *);
// Callback for ts_stack_iterate: receives the user payload, a node's parse
// state, and the number of subtrees between that node and the top.
typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t);
// Invoke the callback for every node reachable from the given version.
void ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSE_STACK_H_

View File

@ -0,0 +1,980 @@
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include "./alloc.h"
#include "./atomic.h"
#include "./subtree.h"
#include "./length.h"
#include "./language.h"
#include "./error_costs.h"
#include <stddef.h>
// Three positions describing an edit: where it starts, and where the edited
// range ended before and after the change.
// NOTE(review): the code that consumes this struct is outside this section;
// confirm the exact semantics there.
typedef struct {
Length start;
Length old_end;
Length new_end;
} Edit;
// Exclusive upper bound on the padding/size fields of a leaf that can be
// stored inline (see ts_subtree_can_inline).
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
// Maximum number of freed subtree allocations kept for reuse by a
// SubtreePool (see ts_subtree_pool_free).
#define TS_MAX_TREE_POOL_SIZE 32
// A zero-length external scanner state.
// NOTE(review): its users are outside this section.
static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}};
// ExternalScannerState
// Initialize `self` with a copy of `length` bytes from `data`. States that
// fit in `short_data` are stored in place; larger ones are copied into a
// freshly allocated `long_data` buffer.
void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) {
  self->length = length;

  char *destination;
  if (length <= sizeof(self->short_data)) {
    destination = self->short_data;
  } else {
    self->long_data = ts_malloc(length);
    destination = self->long_data;
  }

  memcpy(destination, data, length);
}
// Return a deep copy of `self`: inline states are copied by value, and
// heap-backed states get their own `long_data` buffer.
ExternalScannerState ts_external_scanner_state_copy(const ExternalScannerState *self) {
  ExternalScannerState copy = *self;
  if (self->length <= sizeof(self->short_data)) return copy;

  copy.long_data = ts_malloc(self->length);
  memcpy(copy.long_data, self->long_data, self->length);
  return copy;
}
// Free the heap buffer of a state, if it has one. Inline states own no
// separate memory.
void ts_external_scanner_state_delete(ExternalScannerState *self) {
  if (self->length <= sizeof(self->short_data)) return;
  ts_free(self->long_data);
}
// Pointer to the state's bytes, wherever they are stored (inline buffer or
// heap buffer, depending on the length).
const char *ts_external_scanner_state_data(const ExternalScannerState *self) {
  if (self->length <= sizeof(self->short_data)) {
    return self->short_data;
  }
  return self->long_data;
}
// Two states are equal when they are the same object, or have the same length
// and identical bytes.
bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) {
  if (a == b) return true;
  if (a->length != b->length) return false;

  const char *bytes_a = ts_external_scanner_state_data(a);
  const char *bytes_b = ts_external_scanner_state_data(b);
  return !memcmp(bytes_a, bytes_b, a->length);
}
// SubtreeArray
// Make `dest` a copy of `self`. When `self` has storage, `dest` receives its
// own buffer of the same capacity and every copied subtree is retained.
// `self` is taken by value, so `dest` may refer to the array it was read from.
void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) {
  dest->size = self.size;
  dest->capacity = self.capacity;

  if (self.capacity == 0) {
    dest->contents = self.contents;
    return;
  }

  Subtree *duplicate = ts_calloc(self.capacity, sizeof(Subtree));
  memcpy(duplicate, self.contents, self.size * sizeof(Subtree));
  for (uint32_t index = 0; index < self.size; index++) {
    ts_subtree_retain(duplicate[index]);
  }
  dest->contents = duplicate;
}
// Release every subtree held by the array, then free the array's storage.
void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) {
  uint32_t index = 0;
  while (index < self->size) {
    ts_subtree_release(pool, self->contents[index]);
    index++;
  }
  array_delete(self);
}
// Strip the run of "extra" subtrees from the end of `self` and return them as
// a new array, in their original order. `self` is shortened in place; no
// reference counts change.
SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *self) {
  SubtreeArray extras = array_new();

  // `kept` is the number of leading elements that stay in `self`.
  uint32_t kept = self->size;
  while (kept > 0) {
    Subtree last = self->contents[kept - 1];
    if (!ts_subtree_extra(last)) break;
    array_push(&extras, last);
    kept--;
  }
  self->size = kept;

  // Extras were gathered back to front; restore their original order.
  ts_subtree_array_reverse(&extras);
  return extras;
}
// Reverse the order of the array's elements in place.
void ts_subtree_array_reverse(SubtreeArray *self) {
  // `low` walks up from the front, `high` walks down from one past the back;
  // they swap elements until they meet in the middle.
  for (uint32_t low = 0, high = self->size; low + 1 < high; low++) {
    high--;
    Subtree held = self->contents[low];
    self->contents[low] = self->contents[high];
    self->contents[high] = held;
  }
}
// SubtreePool
SubtreePool ts_subtree_pool_new(uint32_t capacity) {
SubtreePool self = {array_new(), array_new()};
array_reserve(&self.free_trees, capacity);
return self;
}
// Free every allocation parked in the pool's free list, then the pool's own
// arrays.
void ts_subtree_pool_delete(SubtreePool *self) {
  if (self->free_trees.contents) {
    for (unsigned slot = 0; slot < self->free_trees.size; slot++) {
      ts_free(self->free_trees.contents[slot].ptr);
    }
    array_delete(&self->free_trees);
  }

  if (self->tree_stack.contents) {
    array_delete(&self->tree_stack);
  }
}
// Get storage for one heap subtree: reuse an entry from the free list when
// one is available, otherwise allocate a new block.
static SubtreeHeapData *ts_subtree_pool_allocate(SubtreePool *self) {
  if (self->free_trees.size == 0) {
    return ts_malloc(sizeof(SubtreeHeapData));
  }
  return array_pop(&self->free_trees).ptr;
}
// Give back storage for one heap subtree: park it on the free list when the
// pool has a free list and it is not yet at TS_MAX_TREE_POOL_SIZE entries,
// otherwise free it outright.
static void ts_subtree_pool_free(SubtreePool *self, SubtreeHeapData *tree) {
  bool can_recycle =
    self->free_trees.capacity > 0 &&
    self->free_trees.size + 1 <= TS_MAX_TREE_POOL_SIZE;

  if (!can_recycle) {
    ts_free(tree);
    return;
  }

  array_push(&self->free_trees, (MutableSubtree) {.ptr = tree});
}
// Subtree
// Decide whether a leaf with the given padding, size and lookahead is small
// enough to be stored inline: every measured field must stay under its limit
// and the leaf must not span more than one row.
static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t lookahead_bytes) {
  if (padding.bytes >= TS_MAX_INLINE_TREE_LENGTH) return false;
  if (padding.extent.row >= 16) return false;
  if (padding.extent.column >= TS_MAX_INLINE_TREE_LENGTH) return false;
  if (size.extent.row != 0) return false;
  if (size.extent.column >= TS_MAX_INLINE_TREE_LENGTH) return false;
  return lookahead_bytes < 16;
}
// Create a leaf subtree for `symbol`.
//
// The leaf is stored inline (no allocation) when the symbol fits in 8 bits,
// it carries no external tokens, and its measurements pass
// ts_subtree_can_inline. Otherwise it is allocated from `pool` with a
// reference count of 1. Visibility and naming come from the language's symbol
// metadata; the leaf is marked "extra" only when it is the end-of-input symbol.
Subtree ts_subtree_new_leaf(
  SubtreePool *pool, TSSymbol symbol, Length padding, Length size,
  uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens,
  bool is_keyword, const TSLanguage *language
) {
  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
  bool extra = symbol == ts_builtin_sym_end;

  bool needs_heap =
    symbol > UINT8_MAX ||
    has_external_tokens ||
    !ts_subtree_can_inline(padding, size, lookahead_bytes);

  if (needs_heap) {
    SubtreeHeapData *data = ts_subtree_pool_allocate(pool);
    *data = (SubtreeHeapData) {
      .ref_count = 1,
      .padding = padding,
      .size = size,
      .lookahead_bytes = lookahead_bytes,
      .error_cost = 0,
      .child_count = 0,
      .symbol = symbol,
      .parse_state = parse_state,
      .visible = metadata.visible,
      .named = metadata.named,
      .extra = extra,
      .fragile_left = false,
      .fragile_right = false,
      .has_changes = false,
      .has_external_tokens = has_external_tokens,
      .is_missing = false,
      .is_keyword = is_keyword,
      .first_leaf = {.symbol = 0, .parse_state = 0},
    };
    return (Subtree) {.ptr = data};
  }

  // Small leaf: pack everything into the inline representation.
  return (Subtree) {{
    .parse_state = parse_state,
    .symbol = symbol,
    .padding_bytes = padding.bytes,
    .padding_rows = padding.extent.row,
    .padding_columns = padding.extent.column,
    .size_bytes = size.bytes,
    .lookahead_bytes = lookahead_bytes,
    .visible = metadata.visible,
    .named = metadata.named,
    .extra = extra,
    .has_changes = false,
    .is_missing = false,
    .is_keyword = is_keyword,
    .is_inline = true,
  }};
}
// Change the symbol of a subtree and refresh its `named` / `visible` flags
// from the language's metadata for the new symbol.
//
// For inline subtrees the symbol must fit the 8-bit inline representation.
// The bound is inclusive (`<= UINT8_MAX`) to match ts_subtree_new_leaf, which
// creates inline leaves for every symbol up to and including UINT8_MAX; the
// previous strict `<` rejected symbol 255 even though it is representable.
void ts_subtree_set_symbol(
  MutableSubtree *self,
  TSSymbol symbol,
  const TSLanguage *language
) {
  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
  if (self->data.is_inline) {
    assert(symbol <= UINT8_MAX);
    self->data.symbol = symbol;
    self->data.named = metadata.named;
    self->data.visible = metadata.visible;
  } else {
    self->ptr->symbol = symbol;
    self->ptr->named = metadata.named;
    self->ptr->visible = metadata.visible;
  }
}
// Create an error leaf covering `size` bytes after `padding`. The leaf is
// built with the builtin error symbol, marked fragile on both sides, and
// remembers the character that triggered the error.
Subtree ts_subtree_new_error(
  SubtreePool *pool, int32_t lookahead_char, Length padding, Length size,
  uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language
) {
  Subtree error_leaf = ts_subtree_new_leaf(
    pool, ts_builtin_sym_error, padding, size, bytes_scanned,
    parse_state, false, false, language
  );

  // The fields below live in the heap representation, so the leaf is
  // accessed through its pointer form.
  SubtreeHeapData *heap_data = (SubtreeHeapData *)error_leaf.ptr;
  heap_data->lookahead_char = lookahead_char;
  heap_data->fragile_left = true;
  heap_data->fragile_right = true;
  return error_leaf;
}
// Obtain a mutable version of `self`, consuming the caller's reference.
//
// Inline subtrees and uniquely-owned heap subtrees are returned as they are.
// A shared heap subtree is cloned: its children array is duplicated (each
// child retained) or, for a childless node with external tokens, its scanner
// state is deep-copied; then the caller's reference to the original is
// released.
MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) {
  if (self.data.is_inline) return (MutableSubtree) {self.data};
  if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self);

  SubtreeHeapData *clone = ts_subtree_pool_allocate(pool);
  memcpy(clone, self.ptr, sizeof(SubtreeHeapData));

  if (clone->child_count > 0) {
    clone->children = ts_calloc(self.ptr->child_count, sizeof(Subtree));
    memcpy(clone->children, self.ptr->children, clone->child_count * sizeof(Subtree));
    for (uint32_t child_index = 0; child_index < clone->child_count; child_index++) {
      ts_subtree_retain(clone->children[child_index]);
    }
  } else if (clone->has_external_tokens) {
    clone->external_scanner_state =
      ts_external_scanner_state_copy(&self.ptr->external_scanner_state);
  }

  clone->ref_count = 1;
  ts_subtree_release(pool, self);
  return (MutableSubtree) {.ptr = clone};
}
// Perform up to `count` rotations down the left spine of `self`, re-linking
// each (tree, child, grandchild) triple that shares `self`'s symbol so that
// the grandchild takes the child's place and the child becomes the
// grandchild's last child. `stack` is used as scratch space and is restored
// to its initial size before returning.
static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLanguage *language,
MutableSubtreeArray *stack) {
unsigned initial_stack_size = stack->size;
MutableSubtree tree = self;
TSSymbol symbol = tree.ptr->symbol;
for (unsigned i = 0; i < count; i++) {
// Nodes that are shared, or that have fewer than two children, are not
// rotated.
if (tree.ptr->ref_count > 1 || tree.ptr->child_count < 2) break;
MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]);
if (
child.data.is_inline ||
child.ptr->child_count < 2 ||
child.ptr->ref_count > 1 ||
child.ptr->symbol != symbol
) break;
MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[0]);
if (
grandchild.data.is_inline ||
grandchild.ptr->child_count < 2 ||
grandchild.ptr->ref_count > 1 ||
grandchild.ptr->symbol != symbol
) break;
// Rotate: the grandchild moves up into the child's slot, the child adopts
// the grandchild's last child as its first child, and the child becomes the
// grandchild's last child.
tree.ptr->children[0] = ts_subtree_from_mut(grandchild);
child.ptr->children[0] = grandchild.ptr->children[grandchild.ptr->child_count - 1];
grandchild.ptr->children[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child);
array_push(stack, tree);
tree = grandchild;
}
// Unwind, re-running ts_subtree_set_children on every node touched by a
// rotation, innermost first, over its (unchanged) children array.
while (stack->size > initial_stack_size) {
tree = array_pop(stack);
MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]);
MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[child.ptr->child_count - 1]);
ts_subtree_set_children(grandchild, grandchild.ptr->children, grandchild.ptr->child_count, language);
ts_subtree_set_children(child, child.ptr->children, child.ptr->child_count, language);
ts_subtree_set_children(tree, tree.ptr->children, tree.ptr->child_count, language);
}
}
// Walks the subtrees reachable from `self` that are held by exactly one
// reference and compresses repetition chains whose first child is deeper
// (in repeat_depth) than its last child.
//
// The pool's `tree_stack` is cleared and then used as the work list.
void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *language) {
  MutableSubtreeArray *work = &pool->tree_stack;
  array_clear(work);

  if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) {
    array_push(work, ts_subtree_to_mut_unsafe(self));
  }

  while (work->size > 0) {
    MutableSubtree node = array_pop(work);

    if (node.ptr->repeat_depth > 0) {
      Subtree first = node.ptr->children[0];
      Subtree last = node.ptr->children[node.ptr->child_count - 1];
      long depth_gap =
        (long)ts_subtree_repeat_depth(first) - (long)ts_subtree_repeat_depth(last);

      if (depth_gap > 0) {
        // Compress with a halving step count: gap/2, gap/4, ... down to 1.
        unsigned step = (unsigned)depth_gap / 2;
        while (step > 0) {
          ts_subtree__compress(node, step, language, work);
          step /= 2;
        }
      }
    }

    // Queue every uniquely-referenced child that itself has children.
    for (uint32_t idx = 0; idx < node.ptr->child_count; idx++) {
      Subtree kid = node.ptr->children[idx];
      if (ts_subtree_child_count(kid) > 0 && kid.ptr->ref_count == 1) {
        array_push(work, ts_subtree_to_mut_unsafe(kid));
      }
    }
  }
}
// Installs `children` (an array of `child_count` subtrees) as the children of
// `self` and recomputes every field of `self` that is derived from them:
// padding, size, lookahead bytes, error cost, dynamic precedence, node count,
// visible/named child counts, external-token flag, fragility, first leaf and
// repeat depth.
//
// `self` must be heap-allocated. If `self` already owned a different children
// array, that array is freed; passing the node's current array is allowed and
// simply triggers a recomputation.
void ts_subtree_set_children(
MutableSubtree self, Subtree *children, uint32_t child_count, const TSLanguage *language
) {
assert(!self.data.is_inline);
if (self.ptr->child_count > 0 && children != self.ptr->children) {
ts_free(self.ptr->children);
}
// Reset all aggregate fields before accumulating over the children.
self.ptr->child_count = child_count;
self.ptr->children = children;
self.ptr->named_child_count = 0;
self.ptr->visible_child_count = 0;
self.ptr->error_cost = 0;
self.ptr->repeat_depth = 0;
self.ptr->node_count = 1;
self.ptr->has_external_tokens = false;
self.ptr->dynamic_precedence = 0;
// Index into the alias sequence; only advanced for non-extra children.
uint32_t non_extra_index = 0;
const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id);
uint32_t lookahead_end_byte = 0;
for (uint32_t i = 0; i < self.ptr->child_count; i++) {
Subtree child = self.ptr->children[i];
// The first child contributes its padding and size separately; every later
// child's padding is folded into this node's size.
if (i == 0) {
self.ptr->padding = ts_subtree_padding(child);
self.ptr->size = ts_subtree_size(child);
} else {
self.ptr->size = length_add(self.ptr->size, ts_subtree_total_size(child));
}
// Track the furthest byte any child looked ahead to, measured from the
// start of this node's padding.
uint32_t child_lookahead_end_byte =
self.ptr->padding.bytes +
self.ptr->size.bytes +
ts_subtree_lookahead_bytes(child);
if (child_lookahead_end_byte > lookahead_end_byte) lookahead_end_byte = child_lookahead_end_byte;
// Children with the error_repeat symbol do not contribute their error cost.
if (ts_subtree_symbol(child) != ts_builtin_sym_error_repeat) {
self.ptr->error_cost += ts_subtree_error_cost(child);
}
self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child);
self.ptr->node_count += ts_subtree_node_count(child);
// Visibility: an alias at this position takes precedence over the child's
// own visibility; an invisible child with children contributes its own
// visible/named child counts instead.
if (alias_sequence && alias_sequence[non_extra_index] != 0 && !ts_subtree_extra(child)) {
self.ptr->visible_child_count++;
if (ts_language_symbol_metadata(language, alias_sequence[non_extra_index]).named) {
self.ptr->named_child_count++;
}
} else if (ts_subtree_visible(child)) {
self.ptr->visible_child_count++;
if (ts_subtree_named(child)) self.ptr->named_child_count++;
} else if (ts_subtree_child_count(child) > 0) {
self.ptr->visible_child_count += child.ptr->visible_child_count;
self.ptr->named_child_count += child.ptr->named_child_count;
}
if (ts_subtree_has_external_tokens(child)) self.ptr->has_external_tokens = true;
// Any error child makes this node fragile on both sides and clears its
// parse state.
if (ts_subtree_is_error(child)) {
self.ptr->fragile_left = self.ptr->fragile_right = true;
self.ptr->parse_state = TS_TREE_STATE_NONE;
}
if (!ts_subtree_extra(child)) non_extra_index++;
}
self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes;
// Error and error_repeat nodes pay an additional cost based on their own
// size and on the trees they contain.
if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) {
self.ptr->error_cost +=
ERROR_COST_PER_RECOVERY +
ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes +
ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row;
for (uint32_t i = 0; i < self.ptr->child_count; i++) {
Subtree child = self.ptr->children[i];
uint32_t grandchild_count = ts_subtree_child_count(child);
if (ts_subtree_extra(child)) continue;
if (ts_subtree_is_error(child) && grandchild_count == 0) continue;
if (ts_subtree_visible(child)) {
self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE;
} else if (grandchild_count > 0) {
self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count;
}
}
}
if (self.ptr->child_count > 0) {
Subtree first_child = self.ptr->children[0];
Subtree last_child = self.ptr->children[self.ptr->child_count - 1];
self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child);
self.ptr->first_leaf.parse_state = ts_subtree_leaf_parse_state(first_child);
// Fragility propagates from the outermost children.
if (ts_subtree_fragile_left(first_child)) self.ptr->fragile_left = true;
if (ts_subtree_fragile_right(last_child)) self.ptr->fragile_right = true;
// An invisible, unnamed node with at least two children whose first child
// has the same symbol gets a repeat depth of one more than the deeper of
// its first and last children.
if (
self.ptr->child_count >= 2 &&
!self.ptr->visible &&
!self.ptr->named &&
ts_subtree_symbol(first_child) == self.ptr->symbol
) {
if (ts_subtree_repeat_depth(first_child) > ts_subtree_repeat_depth(last_child)) {
self.ptr->repeat_depth = ts_subtree_repeat_depth(first_child) + 1;
} else {
self.ptr->repeat_depth = ts_subtree_repeat_depth(last_child) + 1;
}
}
}
}
// Allocates a heap subtree for `symbol` from `pool` and installs the contents
// of `children` as its children.
//
// Visibility and named-ness come from the language's symbol metadata. Nodes
// whose symbol is the builtin error or error_repeat symbol start out fragile
// on both sides.
MutableSubtree ts_subtree_new_node(SubtreePool *pool, TSSymbol symbol,
                                   SubtreeArray *children, unsigned production_id,
                                   const TSLanguage *language) {
  const TSSymbolMetadata symbol_info = ts_language_symbol_metadata(language, symbol);
  const bool is_error_like =
    symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat;

  SubtreeHeapData *node = ts_subtree_pool_allocate(pool);
  *node = (SubtreeHeapData) {
    .ref_count = 1,
    .symbol = symbol,
    .production_id = production_id,
    .visible = symbol_info.visible,
    .named = symbol_info.named,
    .has_changes = false,
    .fragile_left = is_error_like,
    .fragile_right = is_error_like,
    .is_keyword = false,
    .node_count = 0,
    .first_leaf = {.symbol = 0, .parse_state = 0},
  };

  MutableSubtree created = {.ptr = node};
  // Derives size, padding, counts, etc. from the children.
  ts_subtree_set_children(created, children->contents, children->size, language);
  return created;
}
// Builds a node with the builtin error symbol (production id 0) around
// `children`, and sets its `extra` flag to the given value.
Subtree ts_subtree_new_error_node(SubtreePool *pool, SubtreeArray *children,
                                  bool extra, const TSLanguage *language) {
  MutableSubtree error_node =
    ts_subtree_new_node(pool, ts_builtin_sym_error, children, 0, language);
  error_node.ptr->extra = extra;
  return ts_subtree_from_mut(error_node);
}
// Creates a zero-sized leaf for `symbol` with the given padding and marks it
// as missing, whether the leaf ended up inline or heap-allocated.
Subtree ts_subtree_new_missing_leaf(SubtreePool *pool, TSSymbol symbol, Length padding,
                                    const TSLanguage *language) {
  Subtree leaf = ts_subtree_new_leaf(
    pool, symbol, padding, length_zero(), 0,
    0, false, false, language
  );

  if (!leaf.data.is_inline) {
    // The freshly created leaf is uniquely owned, so dropping const is safe here.
    ((SubtreeHeapData *)leaf.ptr)->is_missing = true;
  } else {
    leaf.data.is_missing = true;
  }
  return leaf;
}
// Adds one reference to a heap-allocated subtree. Inline subtrees carry no
// reference count, so they are left alone.
void ts_subtree_retain(Subtree self) {
  if (!self.data.is_inline) {
    assert(self.ptr->ref_count > 0);
    atomic_inc((volatile uint32_t *)&self.ptr->ref_count);
    // A count of zero after incrementing would mean the counter wrapped.
    assert(self.ptr->ref_count != 0);
  }
}
// Drops one reference to `self`. When the count reaches zero the subtree is
// returned to `pool`, and the same is done, iteratively, for every descendant
// whose count reaches zero as a result.
//
// The pool's `tree_stack` is cleared and used as the work list, so no
// recursion is needed.
void ts_subtree_release(SubtreePool *pool, Subtree self) {
  if (self.data.is_inline) return;

  MutableSubtreeArray *pending = &pool->tree_stack;
  array_clear(pending);

  assert(self.ptr->ref_count > 0);
  if (atomic_dec((volatile uint32_t *)&self.ptr->ref_count) != 0) return;
  array_push(pending, ts_subtree_to_mut_unsafe(self));

  do {
    MutableSubtree dead = array_pop(pending);
    const uint32_t child_total = dead.ptr->child_count;

    if (child_total > 0) {
      for (uint32_t idx = 0; idx < child_total; idx++) {
        Subtree kid = dead.ptr->children[idx];
        if (!kid.data.is_inline) {
          assert(kid.ptr->ref_count > 0);
          if (atomic_dec((volatile uint32_t *)&kid.ptr->ref_count) == 0) {
            array_push(pending, ts_subtree_to_mut_unsafe(kid));
          }
        }
      }
      ts_free(dead.ptr->children);
    } else if (dead.ptr->has_external_tokens) {
      // Leaves with external tokens own an external scanner state instead
      // of a children array.
      ts_external_scanner_state_delete(&dead.ptr->external_scanner_state);
    }

    ts_subtree_pool_free(pool, dead.ptr);
  } while (pending->size > 0);
}
// Structural equality of two subtrees.
//
// If either side is inline, the two values are compared bytewise. Two null
// pointers are equal; a null and a non-null pointer are not. Otherwise the
// symbol, visibility, named-ness, padding bytes and size bytes must match;
// error nodes are then compared by lookahead character only, and all other
// nodes by their child counts and, recursively, their children.
bool ts_subtree_eq(Subtree self, Subtree other) {
  if (self.data.is_inline || other.data.is_inline) {
    return memcmp(&self, &other, sizeof(SubtreeInlineData)) == 0;
  }

  const SubtreeHeapData *lhs = self.ptr;
  const SubtreeHeapData *rhs = other.ptr;
  if (!lhs || !rhs) return !lhs && !rhs;

  if (
    lhs->symbol != rhs->symbol ||
    lhs->visible != rhs->visible ||
    lhs->named != rhs->named ||
    lhs->padding.bytes != rhs->padding.bytes ||
    lhs->size.bytes != rhs->size.bytes
  ) return false;

  if (lhs->symbol == ts_builtin_sym_error) {
    return lhs->lookahead_char == rhs->lookahead_char;
  }

  if (lhs->child_count != rhs->child_count) return false;
  if (lhs->child_count == 0) return true;

  if (
    lhs->visible_child_count != rhs->visible_child_count ||
    lhs->named_child_count != rhs->named_child_count
  ) return false;

  for (uint32_t idx = 0; idx < lhs->child_count; idx++) {
    if (!ts_subtree_eq(lhs->children[idx], rhs->children[idx])) return false;
  }
  return true;
}
// Three-way ordering of two subtrees: first by symbol, then by child count,
// then by the first pair of children that compare unequal.
// Returns -1, 0 or 1.
int ts_subtree_compare(Subtree left, Subtree right) {
  const TSSymbol left_symbol = ts_subtree_symbol(left);
  const TSSymbol right_symbol = ts_subtree_symbol(right);
  if (left_symbol != right_symbol) return left_symbol < right_symbol ? -1 : 1;

  const uint32_t left_count = ts_subtree_child_count(left);
  const uint32_t right_count = ts_subtree_child_count(right);
  if (left_count != right_count) return left_count < right_count ? -1 : 1;

  for (uint32_t idx = 0; idx < left_count; idx++) {
    int order = ts_subtree_compare(left.ptr->children[idx], right.ptr->children[idx]);
    if (order != 0) return order;
  }
  return 0;
}
// Sets the has_changes flag on the subtree, in whichever representation
// (heap or inline) it currently uses.
static inline void ts_subtree_set_has_changes(MutableSubtree *self) {
  if (!self->data.is_inline) {
    self->ptr->has_changes = true;
    return;
  }
  self->data.has_changes = true;
}
// Applies a text edit to the subtree rooted at `self` and returns the edited
// root.
//
// The tree is walked with an explicit stack. Each entry pairs a pointer to a
// subtree slot with the edit expressed relative to that subtree's start.
// Every visited subtree is made mutable via ts_subtree_make_mut, has its
// padding/size adjusted, is flagged as changed, and is written back into its
// slot; affected children are then queued with the edit translated into their
// own coordinate space.
Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool) {
typedef struct {
Subtree *tree;
Edit edit;
} StackEntry;
Array(StackEntry) stack = array_new();
array_push(&stack, ((StackEntry) {
.tree = &self,
.edit = (Edit) {
.start = {edit->start_byte, edit->start_point},
.old_end = {edit->old_end_byte, edit->old_end_point},
.new_end = {edit->new_end_byte, edit->new_end_point},
},
}));
while (stack.size) {
StackEntry entry = array_pop(&stack);
Edit edit = entry.edit;
bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes;
bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes;
Length size = ts_subtree_size(*entry.tree);
Length padding = ts_subtree_padding(*entry.tree);
uint32_t lookahead_bytes = ts_subtree_lookahead_bytes(*entry.tree);
uint32_t end_byte = padding.bytes + size.bytes + lookahead_bytes;
// Skip subtrees that lie entirely before the edit (taking lookahead into
// account), and no-op edits that merely touch the subtree's end.
if (edit.start.bytes > end_byte || (is_noop && edit.start.bytes == end_byte)) continue;
// If the edit is entirely within the space before this subtree, then shift this
// subtree over according to the edit without changing its size.
if (edit.old_end.bytes <= padding.bytes) {
padding = length_add(edit.new_end, length_sub(padding, edit.old_end));
}
// If the edit starts in the space before this subtree and extends into this subtree,
// shrink the subtree's content to compensate for the change in the space before it.
else if (edit.start.bytes < padding.bytes) {
size = length_sub(size, length_sub(edit.old_end, padding));
padding = edit.new_end;
}
// If the edit is a pure insertion right at the start of the subtree,
// shift the subtree over according to the insertion.
else if (edit.start.bytes == padding.bytes && is_pure_insertion) {
padding = edit.new_end;
}
// If the edit is within this subtree, resize the subtree to reflect the edit.
else {
uint32_t total_bytes = padding.bytes + size.bytes;
if (edit.start.bytes < total_bytes ||
(edit.start.bytes == total_bytes && is_pure_insertion)) {
size = length_add(
length_sub(edit.new_end, padding),
length_sub(size, length_sub(edit.old_end, padding))
);
}
}
MutableSubtree result = ts_subtree_make_mut(pool, *entry.tree);
if (result.data.is_inline) {
// Keep the inline representation if the new padding/size still fit;
// otherwise promote the leaf to a heap-allocated subtree carrying the
// same flags.
if (ts_subtree_can_inline(padding, size, lookahead_bytes)) {
result.data.padding_bytes = padding.bytes;
result.data.padding_rows = padding.extent.row;
result.data.padding_columns = padding.extent.column;
result.data.size_bytes = size.bytes;
} else {
SubtreeHeapData *data = ts_subtree_pool_allocate(pool);
data->ref_count = 1;
data->padding = padding;
data->size = size;
data->lookahead_bytes = lookahead_bytes;
data->error_cost = 0;
data->child_count = 0;
data->symbol = result.data.symbol;
data->parse_state = result.data.parse_state;
data->visible = result.data.visible;
data->named = result.data.named;
data->extra = result.data.extra;
data->fragile_left = false;
data->fragile_right = false;
data->has_changes = false;
data->has_external_tokens = false;
data->is_missing = result.data.is_missing;
data->is_keyword = result.data.is_keyword;
result.ptr = data;
}
} else {
result.ptr->padding = padding;
result.ptr->size = size;
}
ts_subtree_set_has_changes(&result);
*entry.tree = ts_subtree_from_mut(result);
// Walk the children left to right, tracking each child's extent
// [child_left, child_right) relative to this subtree's start.
Length child_left, child_right = length_zero();
for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; i++) {
Subtree *child = &result.ptr->children[i];
Length child_size = ts_subtree_total_size(*child);
child_left = child_right;
child_right = length_add(child_left, child_size);
// If this child ends before the edit, it is not affected.
if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue;
// If this child starts after the edit, then we're done processing children.
if (child_left.bytes > edit.old_end.bytes ||
(child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)) break;
// Transform edit into the child's coordinate space.
Edit child_edit = {
.start = length_sub(edit.start, child_left),
.old_end = length_sub(edit.old_end, child_left),
.new_end = length_sub(edit.new_end, child_left),
};
// Clamp child_edit to the child's bounds.
if (edit.start.bytes < child_left.bytes) child_edit.start = length_zero();
if (edit.old_end.bytes < child_left.bytes) child_edit.old_end = length_zero();
if (edit.new_end.bytes < child_left.bytes) child_edit.new_end = length_zero();
if (edit.old_end.bytes > child_right.bytes) child_edit.old_end = child_size;
// Interpret all inserted text as applying to the *first* child that touches the edit.
// Subsequent children never have any text inserted into them; they are only
// shrunk to compensate for the edit.
if (child_right.bytes > edit.start.bytes ||
(child_right.bytes == edit.start.bytes && is_pure_insertion)) {
edit.new_end = edit.start;
}
// Children that occur before the edit are not reshaped by the edit.
else {
child_edit.old_end = child_edit.start;
child_edit.new_end = child_edit.start;
}
// Queue processing of this child's subtree.
array_push(&stack, ((StackEntry) {
.tree = child,
.edit = child_edit,
}));
}
}
array_delete(&stack);
return self;
}
// Descends from `tree`, at each level moving into the right-most child that
// has external tokens, until a leaf is reached. Returns NULL_SUBTREE when
// `tree` has no external tokens at all.
Subtree ts_subtree_last_external_token(Subtree tree) {
  if (!ts_subtree_has_external_tokens(tree)) return NULL_SUBTREE;

  while (tree.ptr->child_count > 0) {
    uint32_t idx = tree.ptr->child_count;
    // Scan the children from last to first.
    while (idx-- > 0) {
      Subtree candidate = tree.ptr->children[idx];
      if (ts_subtree_has_external_tokens(candidate)) {
        tree = candidate;
        break;
      }
    }
  }
  return tree;
}
// Formats the code point `c` into `s` (at most `n` bytes, snprintf semantics)
// and returns the length snprintf reports.
//
//   -1                       -> INVALID
//   NUL, newline, tab, CR    -> quoted escape, e.g. '\n'
//   printable ASCII          -> the quoted character, e.g. 'a'
//   anything else            -> the decimal value
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
  switch (c) {
    case -1:   return snprintf(s, n, "INVALID");
    case '\0': return snprintf(s, n, "'\\0'");
    case '\n': return snprintf(s, n, "'\\n'");
    case '\t': return snprintf(s, n, "'\\t'");
    case '\r': return snprintf(s, n, "'\\r'");
    default:   break;
  }

  if (0 < c && c < 128 && isprint(c)) {
    return snprintf(s, n, "'%c'", c);
  }
  return snprintf(s, n, "%d", c);
}
// Writes `string` to `f`, escaping double quotes as \" and newlines as \n so
// the text can sit inside a quoted DOT attribute.
static void ts_subtree__write_dot_string(FILE *f, const char *string) {
  const char *p = string;
  while (*p) {
    switch (*p) {
      case '"':
        fputs("\\\"", f);
        break;
      case '\n':
        fputs("\\n", f);
        break;
      default:
        fputc(*p, f);
        break;
    }
    p++;
  }
}
// Sentinel field name marking the root call of ts_subtree__write_to_string.
// It is compared by pointer identity, not by string contents.
static const char *ROOT_FIELD = "__ROOT__";
// Writes an S-expression rendering of `self` into `string` and returns the
// number of characters the rendering occupies (excluding the terminating NUL).
//
// When `limit` is 0 nothing is stored and only the length is measured.
// `include_all` forces every node to be printed; otherwise a node is printed
// when it is missing, when it is aliased to a named symbol, or when it is
// itself visible and named. `alias_symbol`/`alias_is_named` describe the alias
// the parent applies to this node (0 for none). `field_name` is the field this
// node occupies in its parent, NULL for none, or ROOT_FIELD for the top-level
// call.
//
// NOTE(review): `limit` is passed unchanged to every snprintf rather than
// being reduced as the cursor advances, so the buffer must already be large
// enough for the whole rendering (e.g. sized by a prior measuring pass).
static size_t ts_subtree__write_to_string(
Subtree self, char *string, size_t limit,
const TSLanguage *language, bool include_all,
TSSymbol alias_symbol, bool alias_is_named, const char *field_name
) {
if (!self.ptr) return snprintf(string, limit, "(NULL)");
char *cursor = string;
// In measuring mode (limit == 0) every write targets `string`; otherwise
// writes land at the advancing `cursor`. `cursor` advances in both modes so
// that the final length can be computed.
char **writer = (limit > 0) ? &cursor : &string;
bool is_root = field_name == ROOT_FIELD;
bool is_visible =
include_all ||
ts_subtree_missing(self) ||
(
alias_symbol
? alias_is_named
: ts_subtree_visible(self) && ts_subtree_named(self)
);
if (is_visible) {
// Every printed node except the root is preceded by a space and, if it
// occupies a field, by "name: ".
if (!is_root) {
cursor += snprintf(*writer, limit, " ");
if (field_name) {
cursor += snprintf(*writer, limit, "%s: ", field_name);
}
}
// Non-empty error leaves are printed as the unexpected character.
if (ts_subtree_is_error(self) && ts_subtree_child_count(self) == 0 && self.ptr->size.bytes > 0) {
cursor += snprintf(*writer, limit, "(UNEXPECTED ");
cursor += ts_subtree__write_char_to_string(*writer, limit, self.ptr->lookahead_char);
} else {
TSSymbol symbol = alias_symbol ? alias_symbol : ts_subtree_symbol(self);
const char *symbol_name = ts_language_symbol_name(language, symbol);
if (ts_subtree_missing(self)) {
cursor += snprintf(*writer, limit, "(MISSING ");
// Named symbols are printed bare, anonymous ones in double quotes.
if (alias_is_named || ts_subtree_named(self)) {
cursor += snprintf(*writer, limit, "%s", symbol_name);
} else {
cursor += snprintf(*writer, limit, "\"%s\"", symbol_name);
}
} else {
cursor += snprintf(*writer, limit, "(%s", symbol_name);
}
}
} else if (is_root) {
// An invisible root is still printed, as a quoted symbol name.
TSSymbol symbol = ts_subtree_symbol(self);
const char *symbol_name = ts_language_symbol_name(language, symbol);
cursor += snprintf(*writer, limit, "(\"%s\")", symbol_name);
}
if (ts_subtree_child_count(self)) {
const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id);
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
language,
self.ptr->production_id,
&field_map,
&field_map_end
);
// Index among non-extra children; aliases and field map entries are keyed
// by this index rather than by the raw child index.
uint32_t structural_child_index = 0;
for (uint32_t i = 0; i < self.ptr->child_count; i++) {
Subtree child = self.ptr->children[i];
if (ts_subtree_extra(child)) {
// Extra children never receive an alias or a field name.
cursor += ts_subtree__write_to_string(
child, *writer, limit,
language, include_all,
0, false, NULL
);
} else {
TSSymbol alias_symbol = alias_sequence
? alias_sequence[structural_child_index]
: 0;
bool alias_is_named = alias_symbol
? ts_language_symbol_metadata(language, alias_symbol).named
: false;
// An invisible node passes its own field name down to its children;
// a non-inherited field map entry for this child overrides it.
const char *child_field_name = is_visible ? NULL : field_name;
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (!i->inherited && i->child_index == structural_child_index) {
child_field_name = language->field_names[i->field_id];
break;
}
}
cursor += ts_subtree__write_to_string(
child, *writer, limit,
language, include_all,
alias_symbol, alias_is_named, child_field_name
);
structural_child_index++;
}
}
}
if (is_visible) cursor += snprintf(*writer, limit, ")");
return cursor - string;
}
// Renders `self` as an S-expression in a newly malloc'd, NUL-terminated
// string.
//
// The rendering is done in two passes: a measuring pass with a zero limit to
// learn the required length, then a writing pass into a buffer of exactly
// that size.
//
// Returns NULL if the buffer cannot be allocated.
char *ts_subtree_string(
  Subtree self,
  const TSLanguage *language,
  bool include_all
) {
  // Pass 1: measure. With a limit of 0 nothing is stored in the scratch buffer.
  char scratch_string[1];
  size_t size = ts_subtree__write_to_string(
    self, scratch_string, 0,
    language, include_all,
    0, false, ROOT_FIELD
  ) + 1;

  char *result = malloc(size * sizeof(char));
  // Without this check the writing pass would hand snprintf a NULL buffer
  // together with a non-zero size.
  if (!result) return NULL;

  // Pass 2: write.
  ts_subtree__write_to_string(
    self, result, size,
    language, include_all,
    0, false, ROOT_FIELD
  );
  return result;
}
// Recursively emits DOT statements for the subtree at `self` and all of its
// descendants to `f`.
//
// Each node is named after the address of its Subtree slot ("tree_<ptr>").
// `start_offset` is the byte offset at which this subtree begins, and
// `alias_symbol` is the alias applied by the parent (0 for none). The tooltip
// lists the byte range, parse state, error cost, change flag, repeat depth and
// lookahead bytes.
void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset,
                                 const TSLanguage *language, TSSymbol alias_symbol,
                                 FILE *f) {
  TSSymbol subtree_symbol = ts_subtree_symbol(*self);
  TSSymbol symbol = alias_symbol ? alias_symbol : subtree_symbol;
  uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self);

  // %p requires a pointer to void, hence the explicit casts.
  fprintf(f, "tree_%p [label=\"", (void *)self);
  ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol));
  fprintf(f, "\"");

  if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext");
  if (ts_subtree_extra(*self)) fprintf(f, ", fontcolor=gray");

  fprintf(f, ", tooltip=\""
    "range: %u - %u\n"
    "state: %d\n"
    "error-cost: %u\n"
    "has-changes: %u\n"
    "repeat-depth: %u\n"
    "lookahead-bytes: %u",
    start_offset, end_offset,
    ts_subtree_parse_state(*self),
    ts_subtree_error_cost(*self),
    ts_subtree_has_changes(*self),
    ts_subtree_repeat_depth(*self),
    ts_subtree_lookahead_bytes(*self)
  );

  // Error leaves additionally show the character that was encountered.
  if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0) {
    fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char);
  }

  fprintf(f, "\"]\n");

  uint32_t child_start_offset = start_offset;
  // Index of this production's first entry in the language's flat alias table;
  // advanced once per non-extra child.
  uint32_t child_info_offset =
    language->max_alias_sequence_length *
    ts_subtree_production_id(*self);
  for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) {
    const Subtree *child = &self->ptr->children[i];
    TSSymbol child_alias_symbol = 0;
    if (!ts_subtree_extra(*child) && child_info_offset) {
      child_alias_symbol = language->alias_sequences[child_info_offset];
      child_info_offset++;
    }
    ts_subtree__print_dot_graph(child, child_start_offset, language, child_alias_symbol, f);
    fprintf(f, "tree_%p -> tree_%p [tooltip=%u]\n", (void *)self, (void *)child, i);
    child_start_offset += ts_subtree_total_bytes(*child);
  }
}
// Writes a complete DOT digraph describing `self` to `f`: a header, the
// recursively printed nodes and edges starting at byte offset 0 with no
// alias, and the closing brace.
void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f) {
  fputs("digraph tree {\n", f);
  fputs("edge [arrowhead=none]\n", f);
  ts_subtree__print_dot_graph(&self, 0, language, 0, f);
  fputs("}\n", f);
}
// Compares the external scanner states attached to two subtrees. A subtree
// that is null, has no external tokens, or has children is treated as
// carrying the empty state.
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) {
  const bool self_has_state =
    self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count;
  const bool other_has_state =
    other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count;

  const ExternalScannerState *lhs =
    self_has_state ? &self.ptr->external_scanner_state : &empty_state;
  const ExternalScannerState *rhs =
    other_has_state ? &other.ptr->external_scanner_state : &empty_state;

  return ts_external_scanner_state_eq(lhs, rhs);
}

View File

@ -0,0 +1,285 @@
#ifndef TREE_SITTER_SUBTREE_H_
#define TREE_SITTER_SUBTREE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include "./length.h"
#include "./array.h"
#include "./error_costs.h"
#include "tree_sitter/api.h"
#include "tree_sitter/parser.h"
// Sentinel parse state (the largest unsigned short value) stored in subtrees
// that have no usable parse state.
static const TSStateId TS_TREE_STATE_NONE = USHRT_MAX;
// A Subtree whose pointer representation is NULL.
#define NULL_SUBTREE ((Subtree) {.ptr = NULL})
// Forward declarations; both unions are defined further down in this header.
typedef union Subtree Subtree;
typedef union MutableSubtree MutableSubtree;
// Serialized state of an external scanner, `length` bytes long.
// The bytes live either behind `long_data` or directly in `short_data`.
// NOTE(review): presumably `short_data` is used when the state fits in its
// 24 bytes and `long_data` otherwise — confirm against
// ts_external_scanner_state_init.
typedef struct {
union {
char *long_data;
char short_data[24];
};
uint32_t length;
} ExternalScannerState;
// Compact representation of a leaf subtree that is stored directly inside the
// Subtree union instead of behind a pointer. The `is_inline` bit tells the
// two representations apart; the accessors in this header test it before
// deciding whether to read `data` or `ptr`.
//
// All quantities are stored in narrow fields: 8 bits each for the symbol,
// padding bytes, size bytes and padding columns, and 4 bits each for padding
// rows and lookahead bytes.
typedef struct {
bool is_inline : 1;
bool visible : 1;
bool named : 1;
bool extra : 1;
bool has_changes : 1;
bool is_missing : 1;
bool is_keyword : 1;
uint8_t symbol;
uint8_t padding_bytes;
uint8_t size_bytes;
uint8_t padding_columns;
uint8_t padding_rows : 4;
uint8_t lookahead_bytes : 4;
uint16_t parse_state;
} SubtreeInlineData;
// Heap-allocated, reference-counted representation of a subtree.
//
// The leading fields are common to every subtree. The trailing anonymous
// union holds data that depends on the kind of subtree, as described by the
// comment on each member.
typedef struct {
// Number of owners; adjusted by ts_subtree_retain / ts_subtree_release.
volatile uint32_t ref_count;
Length padding;
Length size;
uint32_t lookahead_bytes;
uint32_t error_cost;
uint32_t child_count;
TSSymbol symbol;
TSStateId parse_state;
bool visible : 1;
bool named : 1;
bool extra : 1;
bool fragile_left : 1;
bool fragile_right : 1;
bool has_changes : 1;
bool has_external_tokens : 1;
bool is_missing : 1;
bool is_keyword : 1;
union {
// Non-terminal subtrees (`child_count > 0`)
struct {
Subtree *children;
uint32_t visible_child_count;
uint32_t named_child_count;
uint32_t node_count;
uint32_t repeat_depth;
int32_t dynamic_precedence;
uint16_t production_id;
// Symbol and parse state of the left-most leaf beneath this node.
struct {
TSSymbol symbol;
TSStateId parse_state;
} first_leaf;
};
// External terminal subtrees (`child_count == 0 && has_external_tokens`)
ExternalScannerState external_scanner_state;
// Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`)
int32_t lookahead_char;
};
} SubtreeHeapData;
// Read-only handle to a subtree: either inline leaf data or a pointer to
// const heap data. `data.is_inline` selects which member is meaningful.
union Subtree {
SubtreeInlineData data;
const SubtreeHeapData *ptr;
};
// Same layout as Subtree, but the heap data may be modified through `ptr`.
union MutableSubtree {
SubtreeInlineData data;
SubtreeHeapData *ptr;
};
// Growable arrays of subtree handles.
typedef Array(Subtree) SubtreeArray;
typedef Array(MutableSubtree) MutableSubtreeArray;
// Allocation pool for heap subtrees.
// `tree_stack` is scratch space used as an explicit work list by the
// balance and release routines.
// NOTE(review): `free_trees` presumably caches released allocations for
// reuse — confirm against ts_subtree_pool_allocate / ts_subtree_pool_free.
typedef struct {
MutableSubtreeArray free_trees;
MutableSubtreeArray tree_stack;
} SubtreePool;
void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned);
const char *ts_external_scanner_state_data(const ExternalScannerState *);
void ts_subtree_array_copy(SubtreeArray, SubtreeArray *);
void ts_subtree_array_delete(SubtreePool *, SubtreeArray *);
SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *);
void ts_subtree_array_reverse(SubtreeArray *);
SubtreePool ts_subtree_pool_new(uint32_t capacity);
void ts_subtree_pool_delete(SubtreePool *);
Subtree ts_subtree_new_leaf(
SubtreePool *, TSSymbol, Length, Length, uint32_t,
TSStateId, bool, bool, const TSLanguage *
);
Subtree ts_subtree_new_error(
SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage *
);
MutableSubtree ts_subtree_new_node(SubtreePool *, TSSymbol, SubtreeArray *, unsigned, const TSLanguage *);
Subtree ts_subtree_new_error_node(SubtreePool *, SubtreeArray *, bool, const TSLanguage *);
Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, const TSLanguage *);
MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree);
void ts_subtree_retain(Subtree);
void ts_subtree_release(SubtreePool *, Subtree);
bool ts_subtree_eq(Subtree, Subtree);
int ts_subtree_compare(Subtree, Subtree);
void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *);
void ts_subtree_set_children(MutableSubtree, Subtree *, uint32_t, const TSLanguage *);
void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *);
Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *);
char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all);
void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *);
Subtree ts_subtree_last_external_token(Subtree);
bool ts_subtree_external_scanner_state_eq(Subtree, Subtree);
// Reads field `name` from whichever representation `self` uses. Only valid
// for fields that exist under the same name in both SubtreeInlineData and
// SubtreeHeapData. The macro is local to the accessors below and is undefined
// right after them.
#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name)
static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); }
static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); }
static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); }
static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); }
static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); }
static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); }
static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); }
static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); }
static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); }
#undef SUBTREE_GET
// Sets the `extra` flag on the subtree in whichever representation it uses.
static inline void ts_subtree_set_extra(MutableSubtree *self) {
  if (!self->data.is_inline) {
    self->ptr->extra = true;
    return;
  }
  self->data.extra = true;
}
// Symbol of the first leaf of `self`: the subtree's own symbol for leaves
// (inline or childless), otherwise the cached first-leaf symbol.
static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) {
  if (self.data.is_inline) return self.data.symbol;
  return self.ptr->child_count == 0
    ? self.ptr->symbol
    : self.ptr->first_leaf.symbol;
}
// Parse state of the first leaf of `self`: the subtree's own state for leaves
// (inline or childless), otherwise the cached first-leaf state.
static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) {
  if (self.data.is_inline) return self.data.parse_state;
  return self.ptr->child_count == 0
    ? self.ptr->parse_state
    : self.ptr->first_leaf.parse_state;
}
// Padding in front of the subtree as a Length. For inline subtrees it is
// assembled from the packed padding bytes, rows and columns.
static inline Length ts_subtree_padding(Subtree self) {
  if (!self.data.is_inline) return self.ptr->padding;

  Length unpacked = {
    self.data.padding_bytes,
    {self.data.padding_rows, self.data.padding_columns}
  };
  return unpacked;
}
// Size of the subtree's own content (excluding padding) as a Length. Inline
// subtrees report row 0 and a column count equal to their byte size.
static inline Length ts_subtree_size(Subtree self) {
  if (!self.data.is_inline) return self.ptr->size;

  Length unpacked = {self.data.size_bytes, {0, self.data.size_bytes}};
  return unpacked;
}
// Padding plus content size of the subtree.
static inline Length ts_subtree_total_size(Subtree self) {
  Length padding = ts_subtree_padding(self);
  Length size = ts_subtree_size(self);
  return length_add(padding, size);
}
// Byte count of the subtree including its padding.
static inline uint32_t ts_subtree_total_bytes(Subtree self) {
  Length total = ts_subtree_total_size(self);
  return total.bytes;
}
// Number of direct children; inline subtrees never have any.
static inline uint32_t ts_subtree_child_count(Subtree self) {
  if (self.data.is_inline) return 0;
  return self.ptr->child_count;
}
// Repeat depth recorded on the subtree; 0 for inline subtrees.
static inline uint32_t ts_subtree_repeat_depth(Subtree self) {
  if (self.data.is_inline) return 0;
  return self.ptr->repeat_depth;
}
// Number of nodes in the subtree: 1 for any leaf (inline or childless),
// otherwise the cached count.
static inline uint32_t ts_subtree_node_count(Subtree self) {
  if (self.data.is_inline || self.ptr->child_count == 0) return 1;
  return self.ptr->node_count;
}
// Cached visible-child count, or 0 for subtrees without children.
static inline uint32_t ts_subtree_visible_child_count(Subtree self) {
  return ts_subtree_child_count(self) > 0 ? self.ptr->visible_child_count : 0;
}
// Error cost of the subtree. Missing subtrees have a fixed cost; other inline
// subtrees cost nothing; heap subtrees report their stored cost.
static inline uint32_t ts_subtree_error_cost(Subtree self) {
  if (ts_subtree_missing(self)) {
    return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY;
  }
  if (self.data.is_inline) return 0;
  return self.ptr->error_cost;
}
// Dynamic precedence of the subtree; leaves (inline or childless) have none.
static inline int32_t ts_subtree_dynamic_precedence(Subtree self) {
  if (self.data.is_inline || self.ptr->child_count == 0) return 0;
  return self.ptr->dynamic_precedence;
}
// Production id of the subtree, or 0 for subtrees without children.
static inline uint16_t ts_subtree_production_id(Subtree self) {
  return ts_subtree_child_count(self) > 0 ? self.ptr->production_id : 0;
}
// Whether the subtree is flagged fragile on its left edge; never true for
// inline subtrees.
static inline bool ts_subtree_fragile_left(Subtree self) {
  if (self.data.is_inline) return false;
  return self.ptr->fragile_left;
}
// Whether the subtree is flagged fragile on its right edge; never true for
// inline subtrees.
static inline bool ts_subtree_fragile_right(Subtree self) {
  if (self.data.is_inline) return false;
  return self.ptr->fragile_right;
}
// Whether the subtree is flagged as containing external tokens; never true
// for inline subtrees.
static inline bool ts_subtree_has_external_tokens(Subtree self) {
  if (self.data.is_inline) return false;
  return self.ptr->has_external_tokens;
}
// Whether the subtree is fragile on either edge; never true for inline
// subtrees.
static inline bool ts_subtree_is_fragile(Subtree self) {
  if (self.data.is_inline) return false;
  return self.ptr->fragile_left || self.ptr->fragile_right;
}
// Whether the subtree's symbol is the builtin error symbol.
static inline bool ts_subtree_is_error(Subtree self) {
  const TSSymbol symbol = ts_subtree_symbol(self);
  return symbol == ts_builtin_sym_error;
}
// Whether the subtree's symbol is the builtin end-of-input symbol.
static inline bool ts_subtree_is_eof(Subtree self) {
  const TSSymbol symbol = ts_subtree_symbol(self);
  return symbol == ts_builtin_sym_end;
}
// Converts a mutable handle into a read-only one by copying the `data`
// member across the two unions.
// NOTE(review): this relies on `data` being at least as large as `ptr` so the
// pointer representation is carried along — confirm for all target platforms.
static inline Subtree ts_subtree_from_mut(MutableSubtree self) {
Subtree result;
result.data = self.data;
return result;
}
// Reinterprets a read-only handle as a mutable one without copying the
// underlying subtree or checking its reference count. The callers visible in
// this file check `ref_count` themselves before modifying the result.
static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) {
MutableSubtree result;
result.data = self.data;
return result;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_SUBTREE_H_

View File

@ -0,0 +1,148 @@
#include "tree_sitter/api.h"
#include "./array.h"
#include "./get_changed_ranges.h"
#include "./subtree.h"
#include "./tree_cursor.h"
#include "./tree.h"
// Number of slots in each tree's parent cache, which is used as a ring buffer.
static const unsigned PARENT_CACHE_CAPACITY = 32;
// Allocates a TSTree wrapping `root` for `language`. The included ranges are
// copied into a newly allocated array; the parent cache starts out empty and
// unallocated. The caller's reference to `root` is taken over as-is (no
// retain happens here).
TSTree *ts_tree_new(
  Subtree root, const TSLanguage *language,
  const TSRange *included_ranges, unsigned included_range_count
) {
  TSTree *tree = ts_malloc(sizeof(TSTree));

  TSRange *ranges_copy = ts_calloc(included_range_count, sizeof(TSRange));
  memcpy(ranges_copy, included_ranges, included_range_count * sizeof(TSRange));

  tree->root = root;
  tree->language = language;
  tree->included_ranges = ranges_copy;
  tree->included_range_count = included_range_count;

  tree->parent_cache = NULL;
  tree->parent_cache_start = 0;
  tree->parent_cache_size = 0;
  return tree;
}
TSTree *ts_tree_copy(const TSTree *self) {
ts_subtree_retain(self->root);
return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count);
}
// Releases the tree's reference to its root and frees everything the tree
// owns. Passing NULL is a no-op.
void ts_tree_delete(TSTree *self) {
  if (self == NULL) return;

  // A throwaway pool provides the scratch stack needed while releasing.
  SubtreePool scratch_pool = ts_subtree_pool_new(0);
  ts_subtree_release(&scratch_pool, self->root);
  ts_subtree_pool_delete(&scratch_pool);

  ts_free(self->included_ranges);
  if (self->parent_cache != NULL) ts_free(self->parent_cache);
  ts_free(self);
}
TSNode ts_tree_root_node(const TSTree *self) {
return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0);
}
// Returns the language this tree was created with.
const TSLanguage *ts_tree_language(const TSTree *self) {
  const TSLanguage *language = self->language;
  return language;
}
// Applies `edit` to the tree: shifts the included ranges that end at or after
// the edit's old end, edits the root subtree, and empties the parent cache.
void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
  for (unsigned idx = 0; idx < self->included_range_count; idx++) {
    TSRange *r = &self->included_ranges[idx];

    // Ranges ending before the old end of the edit are unaffected.
    if (r->end_byte < edit->old_end_byte) continue;

    // An end of UINT32_MAX is left untouched.
    if (r->end_byte != UINT32_MAX) {
      r->end_byte = edit->new_end_byte + (r->end_byte - edit->old_end_byte);
      r->end_point = point_add(
        edit->new_end_point,
        point_sub(r->end_point, edit->old_end_point)
      );
      // A result below new_end_byte means the addition wrapped; saturate.
      if (r->end_byte < edit->new_end_byte) {
        r->end_byte = UINT32_MAX;
        r->end_point = POINT_MAX;
      }
    }

    // The start only moves when it lies at or after the edit's old end.
    if (r->start_byte < edit->old_end_byte) continue;

    r->start_byte = edit->new_end_byte + (r->start_byte - edit->old_end_byte);
    r->start_point = point_add(
      edit->new_end_point,
      point_sub(r->start_point, edit->old_end_point)
    );
    if (r->start_byte < edit->new_end_byte) {
      r->start_byte = UINT32_MAX;
      r->start_point = POINT_MAX;
    }
  }

  SubtreePool scratch_pool = ts_subtree_pool_new(0);
  self->root = ts_subtree_edit(self->root, edit, &scratch_pool);
  // Cached parents refer to the pre-edit tree, so drop them all.
  self->parent_cache_start = 0;
  self->parent_cache_size = 0;
  ts_subtree_pool_delete(&scratch_pool);
}
// Computes the ranges that differ between the trees `self` and `other`.
// The number of ranges is stored in `*count` and the range array produced by
// ts_subtree_get_changed_ranges is returned.
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
  TreeCursor self_cursor = {NULL, array_new()};
  TreeCursor other_cursor = {NULL, array_new()};
  ts_tree_cursor_init(&self_cursor, ts_tree_root_node(self));
  ts_tree_cursor_init(&other_cursor, ts_tree_root_node(other));

  // Differences between the two trees' included ranges are fed into the
  // subtree comparison.
  TSRangeArray range_diffs = array_new();
  ts_range_array_get_changed_ranges(
    self->included_ranges, self->included_range_count,
    other->included_ranges, other->included_range_count,
    &range_diffs
  );

  TSRange *changed;
  *count = ts_subtree_get_changed_ranges(
    &self->root, &other->root, &self_cursor, &other_cursor,
    self->language, &range_diffs, &changed
  );

  array_delete(&range_diffs);
  array_delete(&self_cursor.stack);
  array_delete(&other_cursor.stack);
  return changed;
}
void ts_tree_print_dot_graph(const TSTree *self, FILE *file) {
ts_subtree_print_dot_graph(self->root, self->language, file);
}
// Looks `node` up in the tree's parent cache.
// Returns the cached parent node on a hit, otherwise a node built from NULL
// tree and subtree pointers.
TSNode ts_tree_get_cached_parent(const TSTree *self, const TSNode *node) {
  uint32_t offset = 0;
  while (offset < self->parent_cache_size) {
    // Entries are addressed relative to the cache start, modulo the capacity.
    uint32_t slot = (self->parent_cache_start + offset) % PARENT_CACHE_CAPACITY;
    ParentCacheEntry *cached = &self->parent_cache[slot];
    if (cached->child == node->id) {
      return ts_node_new(self, cached->parent, cached->position, cached->alias_symbol);
    }
    offset++;
  }
  return ts_node_new(NULL, NULL, length_zero(), 0);
}
void ts_tree_set_cached_parent(const TSTree *_self, const TSNode *node, const TSNode *parent) {
TSTree *self = (TSTree *)_self;
if (!self->parent_cache) {
self->parent_cache = ts_calloc(PARENT_CACHE_CAPACITY, sizeof(ParentCacheEntry));
}
uint32_t index = (self->parent_cache_start + self->parent_cache_size) % PARENT_CACHE_CAPACITY;
self->parent_cache[index] = (ParentCacheEntry) {
.child = node->id,
.parent = (const Subtree *)parent->id,
.position = {
parent->context[0],
{parent->context[1], parent->context[2]}
},
.alias_symbol = parent->context[3],
};
if (self->parent_cache_size == PARENT_CACHE_CAPACITY) {
self->parent_cache_start++;
} else {
self->parent_cache_size++;
}
}

View File

@ -0,0 +1,34 @@
#ifndef TREE_SITTER_TREE_H_
#define TREE_SITTER_TREE_H_
#ifdef __cplusplus
extern "C" {
#endif
// One entry of a tree's parent cache: the result of a child -> parent lookup.
// `ts_tree_set_cached_parent` fills an entry from a (node, parent) pair and
// `ts_tree_get_cached_parent` rebuilds the parent node from it.
typedef struct {
// Identity of the node the lookup was made for (compared against `node->id`).
const Subtree *child;
// Subtree of the parent node.
const Subtree *parent;
// Position passed to `ts_node_new` when the parent node is rebuilt.
Length position;
// Alias symbol passed to `ts_node_new` when the parent node is rebuilt.
TSSymbol alias_symbol;
} ParentCacheEntry;
// A syntax tree: its root subtree, its language, a small cache of parent
// lookups, and the list of included ranges.
struct TSTree {
// Root subtree; replaced by `ts_tree_edit`.
Subtree root;
// Language returned by `ts_tree_language`.
const TSLanguage *language;
// Ring buffer of parent lookups, allocated lazily by
// `ts_tree_set_cached_parent`; NULL until the first entry is stored.
ParentCacheEntry *parent_cache;
// Offset of the oldest cache entry (used modulo the cache capacity).
uint32_t parent_cache_start;
// Number of valid cache entries.
uint32_t parent_cache_size;
// Included ranges; `ts_tree_edit` shifts them when the tree is edited.
TSRange *included_ranges;
// Number of elements in `included_ranges`.
unsigned included_range_count;
};
TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned);
TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol);
TSNode ts_tree_get_cached_parent(const TSTree *, const TSNode *);
void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_TREE_H_

View File

@ -0,0 +1,367 @@
#include "tree_sitter/api.h"
#include "./alloc.h"
#include "./tree_cursor.h"
#include "./language.h"
#include "./tree.h"
// State for stepping through the children of one subtree.
// Created by `ts_tree_cursor_iterate_children` and advanced by
// `ts_tree_cursor_child_iterator_next`.
// NOTE: the field order matters; `ts_tree_cursor_iterate_children` uses a
// positional initializer for the "no children" case.
typedef struct {
// Subtree whose children are iterated; its `ptr` is NULL when there are none.
Subtree parent;
// Tree the cursor belongs to.
const TSTree *tree;
// Position reported for the next child to be yielded.
Length position;
// Index of the next child among all children of `parent`.
uint32_t child_index;
// Index of the next child counting only non-extra children; used to index
// `alias_sequence`.
uint32_t structural_child_index;
// Alias sequence of the parent's production, or NULL.
const TSSymbol *alias_sequence;
} CursorChildIterator;
// CursorChildIterator
static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) {
TreeCursorEntry *last_entry = array_back(&self->stack);
if (ts_subtree_child_count(*last_entry->subtree) == 0) {
return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, NULL};
}
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
last_entry->subtree->ptr->production_id
);
return (CursorChildIterator) {
.tree = self->tree,
.parent = *last_entry->subtree,
.position = last_entry->position,
.child_index = 0,
.structural_child_index = 0,
.alias_sequence = alias_sequence,
};
}
// Yields the next child of the iterator's parent.
// On success fills `*result` with the child's cursor entry, sets `*visible`
// to whether the child is visible (by itself or through an alias), advances
// the iterator and returns true. Returns false when there is no parent or all
// children have been yielded.
static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self,
                                                      TreeCursorEntry *result,
                                                      bool *visible) {
  if (!self->parent.ptr) return false;
  if (self->child_index == self->parent.ptr->child_count) return false;

  const Subtree *current = &self->parent.ptr->children[self->child_index];
  result->subtree = current;
  result->position = self->position;
  result->child_index = self->child_index;
  result->structural_child_index = self->structural_child_index;

  // Extra children neither consult nor consume a slot of the alias sequence.
  bool shown = ts_subtree_visible(*current);
  if (self->alias_sequence && !ts_subtree_extra(*current)) {
    if (self->alias_sequence[self->structural_child_index]) shown = true;
    self->structural_child_index++;
  }
  *visible = shown;

  // Move past this child, then past the padding of the following one (if any)
  // so that `position` is where the following child starts.
  self->position = length_add(self->position, ts_subtree_size(*current));
  self->child_index++;
  if (self->child_index < self->parent.ptr->child_count) {
    Subtree upcoming = self->parent.ptr->children[self->child_index];
    self->position = length_add(self->position, ts_subtree_padding(upcoming));
  }
  return true;
}
// TSTreeCursor - lifecycle
// Creates a cursor whose stack holds a single entry for `node`.
TSTreeCursor ts_tree_cursor_new(TSNode node) {
  TSTreeCursor cursor = {NULL, NULL, {0, 0}};
  // The public cursor struct is reinterpreted as the internal TreeCursor.
  TreeCursor *internal = (TreeCursor *)&cursor;
  ts_tree_cursor_init(internal, node);
  return cursor;
}
// Re-initializes an existing cursor so that it starts at `node`.
void ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) {
  TreeCursor *cursor = (TreeCursor *)_self;
  ts_tree_cursor_init(cursor, node);
}
// Points the cursor at `node`: adopts the node's tree, empties the stack and
// pushes one entry describing the node at its start position.
void ts_tree_cursor_init(TreeCursor *self, TSNode node) {
  TreeCursorEntry root_entry = {
    .subtree = (const Subtree *)node.id,
    .position = {
      ts_node_start_byte(node),
      ts_node_start_point(node)
    },
    .child_index = 0,
    .structural_child_index = 0,
  };

  self->tree = node.tree;
  array_clear(&self->stack);
  array_push(&self->stack, root_entry);
}
// Releases the cursor's entry stack.
void ts_tree_cursor_delete(TSTreeCursor *_self) {
  TreeCursor *cursor = (TreeCursor *)_self;
  array_delete(&cursor->stack);
}
// TSTreeCursor - walking the tree
// Moves the cursor to the first visible child of the current node, looking
// through invisible children that themselves contain visible nodes.
// Returns true when the cursor ends up on a visible child, false otherwise.
bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) {
  TreeCursor *cursor = (TreeCursor *)_self;

  for (;;) {
    bool descended = false;
    bool is_visible;
    TreeCursorEntry child;
    CursorChildIterator children = ts_tree_cursor_iterate_children(cursor);

    while (!descended &&
           ts_tree_cursor_child_iterator_next(&children, &child, &is_visible)) {
      if (is_visible) {
        array_push(&cursor->stack, child);
        return true;
      }
      // Invisible child with visible descendants: step into it and start over
      // from there.
      if (ts_subtree_visible_child_count(*child.subtree) > 0) {
        array_push(&cursor->stack, child);
        descended = true;
      }
    }

    if (!descended) return false;
  }
}
// Moves the cursor to the first child of the current node whose end byte is
// greater than `goal_byte`, looking through invisible nodes.
// Returns the number of visible children skipped before the chosen child, or
// -1 (with the stack rolled back to its initial size) when nothing is found.
int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) {
TreeCursor *self = (TreeCursor *)_self;
// Remembered so the stack can be rolled back on failure.
uint32_t initial_size = self->stack.size;
// Visible nodes passed over so far.
uint32_t visible_child_index = 0;
bool did_descend;
do {
did_descend = false;
bool visible;
TreeCursorEntry entry;
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes;
// The child qualifies when it ends strictly after the goal byte.
bool at_goal = end_byte > goal_byte;
uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree);
if (at_goal) {
if (visible) {
array_push(&self->stack, entry);
return visible_child_index;
}
// Invisible but containing visible nodes: descend and repeat the search
// among its children.
if (visible_child_count > 0) {
array_push(&self->stack, entry);
did_descend = true;
break;
}
} else if (visible) {
// Skipped a visible child.
visible_child_index++;
} else {
// Skipped an invisible child: count the visible nodes it contains.
visible_child_index += visible_child_count;
}
}
} while (did_descend);
// We descended into at least one invisible node without finding a match
// there; fall back to the next sibling of the deepest entry pushed.
if (self->stack.size > initial_size &&
ts_tree_cursor_goto_next_sibling((TSTreeCursor *)self)) {
return visible_child_index;
}
// Nothing found: undo any descent.
self->stack.size = initial_size;
return -1;
}
// Moves the cursor to the next visible sibling of the current node, looking
// through invisible wrapper nodes.
// Returns true on success; otherwise restores the stack size and returns
// false.
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
// Remembered so the stack can be rolled back on failure.
// NOTE(review): the rollback presumes popped entries stay intact in the
// stack's storage — confirm against the array_pop implementation.
uint32_t initial_size = self->stack.size;
while (self->stack.size > 1) {
TreeCursorEntry entry = array_pop(&self->stack);
// Iterate the children of the new top entry, resuming at the popped entry.
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
iterator.child_index = entry.child_index;
iterator.structural_child_index = entry.structural_child_index;
iterator.position = entry.position;
bool visible = false;
// Step over the popped entry itself; this also tells us whether it is visible.
ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible);
// A visible entry other than the original current node is a visible
// ancestor: nodes after it are not siblings of the current node.
if (visible && self->stack.size + 1 < initial_size) break;
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
if (visible) {
array_push(&self->stack, entry);
return true;
}
// Invisible sibling containing visible nodes: enter it and land on its
// first visible child.
if (ts_subtree_visible_child_count(*entry.subtree)) {
array_push(&self->stack, entry);
ts_tree_cursor_goto_first_child(_self);
return true;
}
}
}
// No sibling found: put the cursor back where it was.
self->stack.size = initial_size;
return false;
}
// Moves the cursor to the closest visible ancestor of the current node.
// An ancestor counts as visible when its subtree is visible or when its
// parent's production aliases it. Returns true and truncates the stack to
// that ancestor on success; returns false (stack untouched) when there is
// none.
bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) {
  TreeCursor *self = (TreeCursor *)_self;
  // Walk from the entry just below the top down to index 0. `i` is unsigned,
  // so `i + 1 > 0` ends the loop once `i` wraps past zero (and immediately
  // when the stack holds a single entry).
  for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) {
    TreeCursorEntry *entry = &self->stack.contents[i];
    bool is_aliased = false;
    // Extra nodes do not occupy a slot in the alias sequence: the child
    // iterator neither applies an alias to them nor advances
    // `structural_child_index` for them. Looking up their index here would
    // read the alias of a different child, so extras are never aliased.
    if (i > 0 && !ts_subtree_extra(*entry->subtree)) {
      TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
      const TSSymbol *alias_sequence = ts_language_alias_sequence(
        self->tree->language,
        parent_entry->subtree->ptr->production_id
      );
      is_aliased = alias_sequence && alias_sequence[entry->structural_child_index];
    }
    if (ts_subtree_visible(*entry->subtree) || is_aliased) {
      self->stack.size = i + 1;
      return true;
    }
  }
  return false;
}
// Builds the node the cursor currently points at, including the alias symbol
// given to it by its parent's production (0 when there is none, when the node
// is an extra, or when the cursor is on its initial node).
TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
  const TreeCursor *cursor = (const TreeCursor *)_self;
  TreeCursorEntry *current = array_back(&cursor->stack);

  TSSymbol alias = 0;
  bool has_parent = cursor->stack.size > 1;
  if (has_parent && !ts_subtree_extra(*current->subtree)) {
    TreeCursorEntry *parent = &cursor->stack.contents[cursor->stack.size - 2];
    const TSSymbol *aliases = ts_language_alias_sequence(
      cursor->tree->language,
      parent->subtree->ptr->production_id
    );
    if (aliases) {
      alias = aliases[current->structural_child_index];
    }
  }

  return ts_node_new(cursor->tree, current->subtree, current->position, alias);
}
// Reports the field id of the cursor's current node (0 when it has none) and
// two flags describing what may follow it:
//   *can_have_later_siblings                 - some visited entry is not the
//                                              last child of its parent;
//   *can_have_later_siblings_with_this_field - the field map has an entry
//                                              for the same field at a later
//                                              child index.
TSFieldId ts_tree_cursor_current_status(
const TSTreeCursor *_self,
bool *can_have_later_siblings,
bool *can_have_later_siblings_with_this_field
) {
const TreeCursor *self = (const TreeCursor *)_self;
TSFieldId result = 0;
*can_have_later_siblings = false;
*can_have_later_siblings_with_this_field = false;
// Walk up the tree, visiting the current node and its invisible ancestors,
// because fields can refer to nodes through invisible *wrapper* nodes.
for (unsigned i = self->stack.size - 1; i > 0; i--) {
TreeCursorEntry *entry = &self->stack.contents[i];
TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
// Stop walking up when a visible ancestor is found (visible by itself or
// through an alias). The current node itself is always examined.
// NOTE(review): unlike ts_tree_cursor_current_node, this alias lookup does
// not exclude extra nodes — confirm whether that is intended.
if (i != self->stack.size - 1) {
if (ts_subtree_visible(*entry->subtree)) break;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
if (alias_sequence && alias_sequence[entry->structural_child_index]) {
break;
}
}
// The parent has children after this entry.
if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) {
*can_have_later_siblings = true;
}
// Extra nodes are not looked up in the field map.
if (ts_subtree_extra(*entry->subtree)) break;
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
// Look for a field name associated with the current node.
// (The inner `i` below shadows the outer stack index.)
if (!result) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (!i->inherited && i->child_index == entry->structural_child_index) {
result = i->field_id;
*can_have_later_siblings_with_this_field = false;
break;
}
}
}
// Determine if there are other later siblings with the same field name.
if (result) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (i->field_id == result && i->child_index > entry->structural_child_index) {
*can_have_later_siblings_with_this_field = true;
break;
}
}
}
}
return result;
}
// Returns the field id attached to the cursor's current node, or 0 when it
// has none. The field may be attached to the node itself or to one of the
// invisible ancestors that wrap it.
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
  const TreeCursor *cursor = (const TreeCursor *)_self;
  const unsigned last = cursor->stack.size - 1;

  // Visit the current node, then each ancestor up to (not including) the
  // first visible one.
  for (unsigned depth = last; depth > 0; depth--) {
    TreeCursorEntry *node_entry = &cursor->stack.contents[depth];
    TreeCursorEntry *owner_entry = &cursor->stack.contents[depth - 1];

    // Any ancestor that is visible, by itself or through an alias, ends the
    // search.
    if (depth != last) {
      if (ts_subtree_visible(*node_entry->subtree)) return 0;
      const TSSymbol *aliases = ts_language_alias_sequence(
        cursor->tree->language,
        owner_entry->subtree->ptr->production_id
      );
      if (aliases && aliases[node_entry->structural_child_index]) return 0;
    }

    // Extra nodes are not looked up in the field map.
    if (ts_subtree_extra(*node_entry->subtree)) return 0;

    const TSFieldMapEntry *map_entry, *map_end;
    ts_language_field_map(
      cursor->tree->language,
      owner_entry->subtree->ptr->production_id,
      &map_entry, &map_end
    );
    while (map_entry < map_end) {
      if (!map_entry->inherited &&
          map_entry->child_index == node_entry->structural_child_index) {
        return map_entry->field_id;
      }
      map_entry++;
    }
  }
  return 0;
}
// Returns the name of the field attached to the cursor's current node, taken
// from the language's `field_names` table, or NULL when the node has no field.
const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) {
  TSFieldId field = ts_tree_cursor_current_field_id(_self);
  if (!field) return NULL;

  const TreeCursor *cursor = (const TreeCursor *)_self;
  return cursor->tree->language->field_names[field];
}
// Returns a new cursor on the same tree whose stack holds a copy of every
// entry of `_cursor`'s stack.
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) {
  const TreeCursor *source = (const TreeCursor *)_cursor;
  TSTreeCursor duplicate = {NULL, NULL, {0, 0}};
  // Fill the public struct through its internal TreeCursor view.
  TreeCursor *target = (TreeCursor *)&duplicate;
  target->tree = source->tree;
  array_push_all(&target->stack, &source->stack);
  return duplicate;
}

View File

@ -0,0 +1,21 @@
#ifndef TREE_SITTER_TREE_CURSOR_H_
#define TREE_SITTER_TREE_CURSOR_H_
#include "./subtree.h"
// One level of a tree cursor's stack: a subtree plus where it sits inside its
// parent.
typedef struct {
// The subtree at this level.
const Subtree *subtree;
// Position recorded for this subtree.
Length position;
// Index of the subtree among all children of its parent.
uint32_t child_index;
// Index of the subtree counting only non-extra children of its parent; used
// to index the parent's alias sequence and field map.
uint32_t structural_child_index;
} TreeCursorEntry;
// Internal representation of a tree cursor. The public `TSTreeCursor` is cast
// to this type by the cursor functions.
typedef struct {
// Tree being walked.
const TSTree *tree;
// Entries from the cursor's starting node (index 0) down to the current node
// (last element).
Array(TreeCursorEntry) stack;
} TreeCursor;
void ts_tree_cursor_init(TreeCursor *, TSNode);
TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *);
#endif // TREE_SITTER_TREE_CURSOR_H_

View File

@ -0,0 +1,50 @@
#ifndef TREE_SITTER_UNICODE_H_
#define TREE_SITTER_UNICODE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdint.h>
#define U_EXPORT
#define U_EXPORT2
#include "unicode/utf8.h"
#include "unicode/utf16.h"
// Code point value that signals a decoding failure; mirrors ICU's U_SENTINEL.
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
// Signature shared by the decoders below. Each one reads a single Unicode
// code point from `string`, stores it in `*code_point`, and returns the
// number of bytes consumed.
typedef uint32_t (*UnicodeDecodeFunction)(
const uint8_t *string,
uint32_t length,
int32_t *code_point
);
// Decodes one code point from UTF-8 text using ICU's U8_NEXT.
// Stores the decoded value in `*code_point` and returns how many bytes were
// consumed.
static inline uint32_t ts_decode_utf8(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  uint32_t consumed_bytes = 0;
  int32_t decoded;
  U8_NEXT(string, consumed_bytes, length, decoded);
  *code_point = decoded;
  return consumed_bytes;
}
// Decodes one code point from UTF-16 text using ICU's U16_NEXT.
// `string` is reinterpreted as 16-bit code units; the return value is the
// number of code units consumed times two, i.e. the byte count.
// NOTE(review): `length` is handed to U16_NEXT unchanged — confirm whether
// callers pass it in bytes or in 16-bit units.
static inline uint32_t ts_decode_utf16(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  const uint16_t *units = (const uint16_t *)string;
  uint32_t consumed_units = 0;
  int32_t decoded;
  U16_NEXT(units, consumed_units, length, decoded);
  *code_point = decoded;
  return consumed_units * 2;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_UNICODE_H_

View File

@ -0,0 +1 @@
552b01f61127d30d6589aa4bf99468224979b661

View File

@ -0,0 +1,414 @@
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
---------------------
Third-Party Software Licenses
This section contains third-party software notices and/or additional
terms for licensed third-party software components included within ICU
libraries.
1. ICU License - ICU 1.8.1 to ICU 57.1
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2016 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies of
the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale, use
or other dealings in this Software without prior written authorization
of the copyright holder.
All trademarks and registered trademarks mentioned herein are the
property of their respective owners.
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
# The Google Chrome software developed by Google is licensed under
# the BSD license. Other software included in this distribution is
# provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution.
# Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists
# listed below with further processing for compound word breaking. The
# frequency is generated with an iterative training against Google web
# corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia
# * Sinica. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
# University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END--------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN-------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END----------------------------------
3. Lao Word Break Dictionary Data (laodict.txt)
# Copyright (c) 2013 International Business Machines Corporation
# and others. All Rights Reserved.
#
# Project: http://code.google.com/p/lao-dictionary/
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
# (copied below)
#
# This file is derived from the above dictionary, with slight
# modifications.
# ----------------------------------------------------------------------
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification,
# are permitted provided that the following conditions are met:
#
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in
# binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------
4. Burmese Word Break Dictionary Data (burmesedict.txt)
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer. Redistributions in binary form must reproduce the
# above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# --------------------------------------------------------------------------
5. Time Zone Database
ICU uses the public domain data and code derived from Time Zone
Database for its time zone support. The ownership of the TZ database
is explained in BCP 175: Procedure for Maintaining the Time Zone
Database section 7.
# 7. Database Ownership
#
# The TZ database itself is not an IETF Contribution or an IETF
# document. Rather it is a pre-existing and regularly updated work
# that is in the public domain, and is intended to remain in the
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
# not apply to the TZ Database or contributions that individuals make
# to it. Should any claims be made and substantiated against the TZ
# Database, the organization that is providing the IANA
# Considerations defined in this RFC, under the memorandum of
# understanding with the IETF, currently ICANN, may act in accordance
# with all competent court orders. No ownership claims will be made
# by ICANN or the IETF Trust on the database or the code. Any person
# making a contribution to the database or code waives all rights to
# future claims in that contribution or in the TZ Database.
6. Google double-conversion
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,29 @@
# ICU Parts
This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
### License
The license for these files is contained in the `LICENSE` file within this directory.
### Contents
* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
  * `utf8.h`
  * `utf16.h`
  * `umachine.h`
* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
  * `ptypes.h`
  * `urename.h`
  * `utf.h`
* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
* `README.md` - This text file.
### Updating ICU
To incorporate changes from the upstream `icu` repository:
* Update `ICU_SHA` with the new Git SHA.
* Update `LICENSE` with the license text from the directory mentioned above.
* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,448 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: umachine.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file defines basic types and constants for ICU to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*/
#ifndef __UMACHINE_H__
#define __UMACHINE_H__
/**
* \file
* \brief Basic types and constants for UTF
*
* <h2> Basic types and constants for UTF </h2>
* This file defines basic types and constants for utf.h to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*
*/
/*==========================================================================*/
/* Include platform-dependent definitions */
/* which are contained in the platform-specific file platform.h */
/*==========================================================================*/
#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */
/*
* ANSI C headers:
* stddef.h defines wchar_t
*/
#include <stddef.h>
/*==========================================================================*/
/* For C wrappers, we use the symbol U_STABLE. */
/* This works properly if the includer is C or C++. */
/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
/*==========================================================================*/
/**
 * \def U_CFUNC
 * Linkage prefix for the declaration of a library private ICU C function:
 * plain `extern` when compiled as C, `extern "C"` when compiled as C++.
 * @stable ICU 2.4
 */
/**
 * \def U_CDECL_BEGIN
 * Opens a group of library private ICU C API declarations.
 * Expands to nothing in C and to `extern "C" {` in C++.
 * @stable ICU 2.4
 */
/**
 * \def U_CDECL_END
 * Closes a group opened with U_CDECL_BEGIN.
 * Expands to nothing in C and to `}` in C++.
 * @stable ICU 2.4
 */
#if !defined(__cplusplus)
#   define U_CFUNC extern
#   define U_CDECL_BEGIN
#   define U_CDECL_END
#else
#   define U_CFUNC extern "C"
#   define U_CDECL_BEGIN extern "C" {
#   define U_CDECL_END }
#endif
#ifndef U_ATTRIBUTE_DEPRECATED
/**
* \def U_ATTRIBUTE_DEPRECATED
* Compiler-specific attribute that marks a declaration as deprecated.
* A definition made before this header is included is kept as-is.
* This first branch selects the GCC spelling when U_GCC_MAJOR_MINOR >= 302.
* NOTE(review): U_GCC_MAJOR_MINOR is not defined in this header; presumably it
* is supplied by the platform header behind ptypes.h -- confirm. When it is
* undefined the preprocessor evaluates it as 0 and this branch is skipped.
* @internal
*/
#if U_GCC_MAJOR_MINOR >= 302
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
/**
* \def U_ATTRIBUTE_DEPRECATED
* Visual C++ spelling, selected when _MSC_VER >= 1400.
* @internal
*/
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
#else
/* No known deprecation attribute for this compiler: expand to nothing. */
# define U_ATTRIBUTE_DEPRECATED
#endif
#endif
/**
* This is used to declare a function as a public ICU C API:
* U_CFUNC linkage followed by U_EXPORT.
* NOTE(review): U_EXPORT is not defined in this header; it has to come from
* the includer or a platform header -- confirm for this vendored copy.
* @stable ICU 2.0
*/
#define U_CAPI U_CFUNC U_EXPORT
/** This is used to declare a function as a stable public ICU C API (same expansion as U_CAPI) */
#define U_STABLE U_CAPI
/** This is used to declare a function as a draft public ICU C API (same expansion as U_CAPI) */
#define U_DRAFT U_CAPI
/** This is used to declare a function as a deprecated public ICU C API: U_CAPI plus U_ATTRIBUTE_DEPRECATED */
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
/** This is used to declare a function as an obsolete public ICU C API (same expansion as U_CAPI) */
#define U_OBSOLETE U_CAPI
/** This is used to declare a function as an internal ICU C API (same expansion as U_CAPI) */
#define U_INTERNAL U_CAPI
/**
* \def U_OVERRIDE
* Defined to the C++11 "override" keyword unless U_OVERRIDE was already
* defined before this header was included.
* Denotes a class or member which is an override of the base class.
* May result in an error if it is applied to something that is not an override.
* @internal
*/
#ifndef U_OVERRIDE
#define U_OVERRIDE override
#endif
/**
* \def U_FINAL
* Defined to the C++11 "final" keyword unless U_FINAL was already defined
* before this header was included; when U_IN_DOXYGEN is defined the
* definition below is emitted regardless.
* Denotes a class or member which may not be overridden in subclasses.
* May result in an error if subclasses attempt to override.
* @internal
*/
#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
#define U_FINAL final
#endif
// Before ICU 65, function-like, multi-statement ICU macros were just defined as
// series of statements wrapped in { } blocks and the caller could choose to
// either treat them as if they were actual functions and end the invocation
// with a trailing ; creating an empty statement after the block or else omit
// this trailing ; using the knowledge that the macro would expand to { }.
//
// But doing so doesn't work well with macros that look like functions and
// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
// switches to the standard solution of wrapping such macros in do { } while.
//
// This will however break existing code that depends on being able to invoke
// these macros without a trailing ; so to be able to remain compatible with
// such code the wrapper is itself defined as macros so that it's possible to
// build ICU 65 and later with the old macro behaviour, like this:
//
// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
// runConfigureICU ...
/**
* \def UPRV_BLOCK_MACRO_BEGIN
* Opening half of the wrapper placed around multi-statement macros.
* Defined as the "do" keyword by default; a definition made before this
* header is included (for example an empty one) takes precedence.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_BEGIN
#define UPRV_BLOCK_MACRO_BEGIN do
#endif
/**
* \def UPRV_BLOCK_MACRO_END
* Closing half of the wrapper placed around multi-statement macros.
* Defined as "while (FALSE)" by default; a definition made before this
* header is included takes precedence.
* FALSE is only defined further down in this header. That is sufficient
* because the replacement text is expanded where a block macro is used,
* not at this point.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_END
#define UPRV_BLOCK_MACRO_END while (FALSE)
#endif
/*==========================================================================*/
/* limits for int32_t etc., like in POSIX inttypes.h */
/*==========================================================================*/
/*
 * Fallback definitions for the fixed-width integer limits.
 * Each one is only provided when no earlier header defined it.
 */

/* --- signed minima --- */
#if !defined(INT8_MIN)
/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
#   define INT8_MIN ((int8_t)(-128))
#endif
#if !defined(INT16_MIN)
/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
#   define INT16_MIN ((int16_t)(-32767-1))
#endif
#if !defined(INT32_MIN)
/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
#   define INT32_MIN ((int32_t)(-2147483647-1))
#endif

/* --- signed maxima --- */
#if !defined(INT8_MAX)
/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
#   define INT8_MAX ((int8_t)(127))
#endif
#if !defined(INT16_MAX)
/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
#   define INT16_MAX ((int16_t)(32767))
#endif
#if !defined(INT32_MAX)
/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
#   define INT32_MAX ((int32_t)(2147483647))
#endif

/* --- unsigned maxima --- */
#if !defined(UINT8_MAX)
/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
#   define UINT8_MAX ((uint8_t)(255U))
#endif
#if !defined(UINT16_MAX)
/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
#   define UINT16_MAX ((uint16_t)(65535U))
#endif
#if !defined(UINT32_MAX)
/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
#   define UINT32_MAX ((uint32_t)(4294967295U))
#endif
/*
 * 64-bit integer support: a hard error when the platform reports that
 * int64_t is unavailable, otherwise fallback constant-suffix macros and
 * the ICU-prefixed 64-bit limits (each only if not already defined).
 */
#ifdef U_INT64_T_UNAVAILABLE
# error int64_t is required for decimal format and rule-based number format.
#else
#   if !defined(INT64_C)
/**
 * Provides a platform independent way to specify a signed 64-bit integer constant.
 * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
 * @stable ICU 2.8
 */
#       define INT64_C(c) c ## LL
#   endif
#   if !defined(UINT64_C)
/**
 * Provides a platform independent way to specify an unsigned 64-bit integer constant.
 * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
 * @stable ICU 2.8
 */
#       define UINT64_C(c) c ## ULL
#   endif
#   if !defined(U_INT64_MIN)
/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
#       define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
#   endif
#   if !defined(U_INT64_MAX)
/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
#       define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
#   endif
#   if !defined(U_UINT64_MAX)
/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
#       define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
#   endif
#endif
/*==========================================================================*/
/* Boolean data type */
/*==========================================================================*/
/** The ICU boolean type: a signed 8-bit integer. @stable ICU 2.0 */
typedef int8_t UBool;

/* TRUE/FALSE are only supplied when no earlier header defined them. */
#if !defined(TRUE)
/** The TRUE value of a UBool @stable ICU 2.0 */
#   define TRUE 1
#endif
#if !defined(FALSE)
/** The FALSE value of a UBool @stable ICU 2.0 */
#   define FALSE 0
#endif
/*==========================================================================*/
/* Unicode data types */
/*==========================================================================*/
/* wchar_t-related definitions -------------------------------------------- */
/*
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
*
* @stable ICU 2.0
*/
/*
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
*
* @stable ICU 2.0
*/
/*
* Auto-detection of the wchar_t encoding; skipped entirely when either macro
* was already defined by the includer. At most one of the two macros is
* defined by this ladder, and possibly neither.
* NOTE(review): U_SIZEOF_WCHAR_T, U_PLATFORM, the U_PF_* constants and the
* U_PLATFORM_* predicates are not defined in this header; presumably they come
* from the platform header behind ptypes.h -- confirm for this vendored copy.
* Identifiers that are undefined evaluate to 0 inside #if.
*/
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
/* The compiler advertises ISO 10646 wchar_t values: decide by wchar_t size. */
# ifdef __STDC_ISO_10646__
# if (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# elif (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
/* __UCS2__ is set: UTF-16 only on platforms in the OS/390..OS/400 range with a 2-byte wchar_t. */
# elif defined __UCS2__
# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# endif
/* __UCS4__ (or __UTF32__ on OS/400) is set: UTF-32 when wchar_t is 4 bytes. */
# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
# if (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
/* Darwin-based platforms, and Linux-based ones with a 4-byte wchar_t. */
# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
# define U_WCHAR_IS_UTF32
/* Platforms with the Win32 API. */
# elif U_PLATFORM_HAS_WIN32_API
# define U_WCHAR_IS_UTF16
# endif
#endif
/* UChar and UChar32 definitions -------------------------------------------- */
/** Number of bytes in a UChar, i.e. in one 16-bit UTF-16 code unit. @stable ICU 2.0 */
#define U_SIZEOF_UCHAR 2
/**
 * \def U_CHAR16_IS_TYPEDEF
 * If 1, then char16_t is a typedef and not a real type (yet)
 *
 * The AIX branch is only considered when both U_PLATFORM and U_PF_AIX are
 * actually defined. In an #if, undefined identifiers evaluate to 0, so an
 * unguarded `U_PLATFORM == U_PF_AIX` is true (0 == 0) whenever the platform
 * header is absent -- as with an empty ptypes.h -- and a C++ build would then
 * wrongly include <uchar.h> and report char16_t as a typedef.
 * When the platform header is present the result is unchanged.
 * @internal
 */
#if defined(U_PLATFORM) && defined(U_PF_AIX) && (U_PLATFORM == U_PF_AIX) && \
    defined(__cplusplus) && (U_CPLUSPLUS_VERSION < 11)
// for AIX, uchar.h needs to be included
#   include <uchar.h>
#   define U_CHAR16_IS_TYPEDEF 1
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
#   define U_CHAR16_IS_TYPEDEF 1
#else
#   define U_CHAR16_IS_TYPEDEF 0
#endif
/**
* \var UChar
*
* The base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
* Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
*
* UChar is configurable by defining the macro UCHAR_TYPE
* on the preprocessor or compiler command line:
* -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
* (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
* This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
*
* The default is UChar=char16_t when compiled as C++ and UChar=uint16_t
* when compiled as C (see the typedef ladder below).
*
* C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
*
* In C, char16_t is a simple typedef of uint_least16_t.
* ICU requires uint_least16_t=uint16_t for data memory mapping.
* On macOS, char16_t is not available because the uchar.h standard header is missing.
*
* @stable ICU 4.4
*/
/*
* Configuration-testing switch: with "#if 1" nothing is defined here; only
* after changing it to "#if 0" does the #elif branch default UCHAR_TYPE to
* uint16_t (and only when UCHAR_TYPE is not already defined).
*/
#if 1
// #if 1 is normal. UChar defaults to char16_t in C++.
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
// The intltest Makefile #defines UCHAR_TYPE=char16_t,
// so we only #define it to uint16_t if it is undefined so far.
#elif !defined(UCHAR_TYPE)
# define UCHAR_TYPE uint16_t
#endif
/*
* Selection order: ICU library builds always get char16_t; otherwise an
* explicit UCHAR_TYPE wins; otherwise char16_t in C++ and uint16_t in C.
*/
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
// Inside the ICU library code, never configurable.
typedef char16_t UChar;
#elif defined(UCHAR_TYPE)
typedef UCHAR_TYPE UChar;
#elif defined(__cplusplus)
typedef char16_t UChar;
#else
typedef uint16_t UChar;
#endif
/**
* \var OldUChar
* Default ICU 58 definition of UChar.
* A base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
*
* Define OldUChar to be wchar_t if that is 16 bits wide (U_SIZEOF_WCHAR_T==2).
* If wchar_t is not 16 bits wide, then define OldUChar to be the compiler's
* __CHAR16_TYPE__ when that macro is defined, and uint16_t otherwise.
*
* This makes the definition of OldUChar platform-dependent
* but allows direct string type compatibility with platforms with
* 16-bit wchar_t types.
*
* This is how UChar was defined in ICU 58, for transition convenience.
* Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
* The current UChar responds to UCHAR_TYPE but OldUChar does not.
*
* NOTE(review): U_SIZEOF_WCHAR_T is not defined in this header; when it is
* undefined the first test is false (undefined identifiers evaluate to 0 in #if).
*
* @stable ICU 59
*/
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t OldUChar;
#elif defined(__CHAR16_TYPE__)
typedef __CHAR16_TYPE__ OldUChar;
#else
typedef uint16_t OldUChar;
#endif
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer (same as int32_t).
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 was platform-dependent.
*
* NOTE(review): int32_t must already be declared at this point; this header
* itself only includes "unicode/ptypes.h" and <stddef.h> -- confirm that the
* includer provides the fixed-width integer types.
*
* @see U_SENTINEL
* @stable ICU 2.4
*/
typedef int32_t UChar32;
/**
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with U_SENTINEL.
*
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* Expands to the constant expression (-1).
*
* @return -1
* @see UChar32
* @stable ICU 2.4
*/
#define U_SENTINEL (-1)
#include "unicode/urename.h"
#endif

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,733 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf16.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep09
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 16-bit Unicode handling macros
*
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF16_H__
#define __UTF16_H__
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* single-code point definitions -------------------------------------------- */
/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 *
 * Implemented directly as a bit test: a code unit is a surrogate exactly
 * when its bits above the low 11 equal 0xd800 (U+d800..U+dfff). This is the
 * same test U_IS_SURROGATE() performs, but it does not require utf.h to
 * provide that macro, so it also works when utf.h is an empty placeholder.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (((c)&0xfffff800)!=0xd800)
/**
 * Tests whether a code unit is a lead surrogate (U+d800..U+dbff):
 * everything above the low 10 bits must equal 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))
/**
 * Tests whether a code unit is a trail surrogate (U+dc00..U+dfff):
 * everything above the low 10 bits must equal 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))
/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 *
 * Implemented directly as a bit test (bits above the low 11 equal 0xd800),
 * which is the same test U_IS_SURROGATE() performs. Spelling it out here
 * removes the dependency on utf.h providing U_IS_SURROGATE, so the macro
 * also works when utf.h is an empty placeholder.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
/**
 * For a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the lead one: bit 0x400 is clear for lead surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (0==((c)&0x400))
/**
 * For a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the trail one: bit 0x400 is set for trail surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))
/**
 * Helper constant for U16_GET_SUPPLEMENTARY: the amount by which
 * (lead<<10)+trail exceeds the encoded code point, i.e. the value of that
 * expression for the first pair (0xd800, 0xdc00) minus 0x10000.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800 << 10) + 0xdc00 - 0x10000)
/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) they encode.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((((UChar32)(lead)) << 10UL) + ((UChar32)(trail)) - U16_SURROGATE_OFFSET)
/**
 * Compute the lead surrogate (0xd800..0xdbff) that starts the surrogate
 * pair for a supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(0xd7c0+((supplementary)>>10))
/**
 * Compute the trail surrogate (0xdc00..0xdfff) that ends the surrogate
 * pair for a supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(0xdc00|((supplementary)&0x3ff))
/**
 * Number of 16-bit code units needed to encode a Unicode code point:
 * 2 for anything above U+ffff, 1 otherwise.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
* This is the larger of the two values U16_LENGTH() can yield: a code point
* above U+ffff is written as a lead/trail surrogate pair.
* @return 2
* @stable ICU 2.4
*/
#define U16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* The result is undefined if the offset points to a single, unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_GET
* @stable ICU 2.4
*/
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* Start from the code unit at the offset itself. */ \
    (c) = (s)[i]; \
    if (!U16_IS_SURROGATE(c)) { \
        /* Not a surrogate: c already holds the code point. */ \
    } else if (U16_IS_SURROGATE_LEAD(c)) { \
        /* Offset is on the lead unit: combine with the unit after it. */ \
        (c) = U16_GET_SUPPLEMENTARY((c), (s)[(i) + 1]); \
    } else { \
        /* Offset is on the trail unit: combine with the unit before it. */ \
        (c) = U16_GET_SUPPLEMENTARY((s)[(i) - 1], (c)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 2.4
*/
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c) = (s)[i]; \
    if (U16_IS_SURROGATE(c)) { \
        uint16_t __c2; /* neighbouring code unit, assigned inside the tests below */ \
        if (U16_IS_SURROGATE_LEAD(c)) { \
            /* Look ahead only if the lead is not the last unit of the string. */ \
            if ((i) + 1 != (length) && U16_IS_TRAIL(__c2 = (s)[(i) + 1])) { \
                (c) = U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else if ((i) > (start) && U16_IS_LEAD(__c2 = (s)[(i) - 1])) { \
            /* Trail unit with a lead unit right before it, inside the string. */ \
            (c) = U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
        /* In every other case c stays the unpaired surrogate. */ \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 60
*/
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c) = (s)[i]; \
    if (U16_IS_SURROGATE(c)) { \
        uint16_t __c2; /* neighbouring code unit, assigned inside the tests below */ \
        if (U16_IS_SURROGATE_LEAD(c)) { \
            /* Look ahead only if the lead is not the last unit of the string. */ \
            if ((i) + 1 != (length) && U16_IS_TRAIL(__c2 = (s)[(i) + 1])) { \
                (c) = U16_GET_SUPPLEMENTARY((c), __c2); \
            } else { \
                /* Unpaired lead surrogate. */ \
                (c) = 0xfffd; \
            } \
        } else if ((i) > (start) && U16_IS_LEAD(__c2 = (s)[(i) - 1])) { \
            /* Trail unit with a lead unit right before it, inside the string. */ \
            (c) = U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } else { \
            /* Unpaired trail surrogate. */ \
            (c) = 0xfffd; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset points to a single, unpaired lead surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_NEXT
* @stable ICU 2.4
*/
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* Consume one code unit. */ \
    (c) = (s)[(i)++]; \
    if (U16_IS_LEAD(c)) { \
        /* A lead unit is assumed to be followed by its trail: consume that too. */ \
        (c) = U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* Consume one code unit. */ \
    (c) = (s)[(i)++]; \
    if (U16_IS_LEAD(c)) { \
        uint16_t __c2; /* candidate trail unit */ \
        /* Peek at the next unit only if the end of the string is not reached. */ \
        if ((i) != (length) && U16_IS_TRAIL(__c2 = (s)[(i)])) { \
            ++(i); \
            (c) = U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
        /* Otherwise c stays the unpaired lead surrogate. */ \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 60
*/
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* Consume one code unit. */ \
    (c) = (s)[(i)++]; \
    if (U16_IS_SURROGATE(c)) { \
        uint16_t __c2; /* candidate trail unit */ \
        if (U16_IS_SURROGATE_LEAD(c) && \
                (i) != (length) && \
                U16_IS_TRAIL(__c2 = (s)[(i)])) { \
            /* Complete pair: consume the trail unit and combine. */ \
            ++(i); \
            (c) = U16_GET_SUPPLEMENTARY((c), __c2); \
        } else { \
            /* Trail surrogate, or lead surrogate without a following trail. */ \
            (c) = 0xfffd; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s UChar * string buffer (written to, so it must not be const)
* @param i string offset
* @param c code point to append
* @see U16_APPEND
* @stable ICU 2.4
*/
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    if ((uint32_t)(c) > 0xffff) { \
        /* Above U+ffff: write the lead unit, then the trail unit. */ \
        (s)[(i)++] = (uint16_t)(((c) >> 10) + 0xd7c0); \
        (s)[(i)++] = (uint16_t)(((c) & 0x3ff) | 0xdc00); \
    } else { \
        /* Fits in a single code unit. */ \
        (s)[(i)++] = (uint16_t)(c); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a surrogate pair is written, checks for sufficient space in the string.
* If the code point is not valid or a trail surrogate does not fit,
* then isError is set to TRUE.
*
* @param s UChar * string buffer (written to, so it must not be const)
* @param i string offset, must be i<capacity
* @param capacity size of the string buffer
* @param c code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U16_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    if ((uint32_t)(c) <= 0xffff) { \
        /* Fits in a single code unit. */ \
        (s)[(i)++] = (uint16_t)(c); \
    } else if ((uint32_t)(c) <= 0x10ffff && (i) + 1 < (capacity)) { \
        /* Supplementary code point and room for both units of the pair. */ \
        (s)[(i)++] = (uint16_t)(((c) >> 10) + 0xd7c0); \
        (s)[(i)++] = (uint16_t)(((c) & 0x3ff) | 0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        /* Nothing is written; only the error flag is raised. */ \
        (isError) = TRUE; \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_FWD_1
* @stable ICU 2.4
*/
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* Always step over one unit; step over a second one after a lead unit. */ \
    if (U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @see U16_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    /* Always step over one unit. Step over a second one only when the */ \
    /* first was a lead, the string has not ended, and a trail follows. */ \
    if (U16_IS_LEAD((s)[(i)++]) && \
            (i) != (length) && \
            U16_IS_TRAIL((s)[i])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_FWD_N
* @stable ICU 2.4
*/
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N; /* code points still to skip; (n) is evaluated once */ \
    for (__N = (n); __N > 0; --__N) { \
        U16_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U16_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N; /* code points still to skip; (n) is evaluated once */ \
    /* Stop early at the end of the string: either the offset reaches */ \
    /* (length), or, for a negative length, a NUL code unit is reached. */ \
    for (__N = (n); \
         __N > 0 && ((i) < (length) || ((length) < 0 && (s)[i] != 0)); \
         --__N) { \
        U16_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_START
* @stable ICU 2.4
*/
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* On a trail unit: step back to the unit before it. */ \
    if (U16_IS_TRAIL((s)[i])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Snap a random-access offset back to the start of the code point it is in.
 * When s[i] is the trail surrogate of a surrogate pair the offset is
 * decremented by one; in every other case it is left as it is.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * the offset only moves when it is above start and the preceding unit really
 * is a lead surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_TRAIL((s)[i])) { \
        /* only look at s[i-1] once we know it lies inside the string */ \
        if((i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
            --(i); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
 * Step the string offset back from one code point boundary to the previous
 * one and fetch the code point that lies between the two.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * When the offset is behind the trail surrogate of a supplementary code
 * point, the preceding lead surrogate is read as well.
 * When the offset is behind a lead surrogate, that surrogate itself is
 * returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    (c)=(s)[i]; \
    if(U16_IS_TRAIL(c)) { \
        /* the unit before a trail surrogate is taken to be its lead, unchecked */ \
        --(i); \
        (c)=U16_GET_SUPPLEMENTARY((s)[i], (c)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Step the string offset back from one code point boundary to the previous
 * one and fetch the code point that lies between the two.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * When the offset is behind the trail surrogate of a supplementary code
 * point, the preceding lead surrogate is read as well.
 * When the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, c is set to that unpaired surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    (c)=(s)[i]; \
    if(U16_IS_TRAIL(c) && (i)>(start)) { \
        /* s[i-1] is inside the string; combine only if it is a lead surrogate */ \
        uint16_t _u16_prev_lead=(s)[(i)-1]; \
        if(U16_IS_LEAD(_u16_prev_lead)) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(_u16_prev_lead, (c)); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Step the string offset back from one code point boundary to the previous
 * one and fetch the code point that lies between the two.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * When the offset is behind the trail surrogate of a supplementary code
 * point, the preceding lead surrogate is read as well.
 * When the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, c is set to U+FFFD.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 60
 */
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t _u16_prev_or_fffd_lead; \
        if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && \
                U16_IS_LEAD(_u16_prev_or_fffd_lead=(s)[(i)-1])) { \
            /* proper pair: consume the lead and assemble the code point */ \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(_u16_prev_or_fffd_lead, (c)); \
        } else { \
            /* unpaired surrogate of either kind */ \
            (c)=0xfffd; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Step the string offset back from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    if(U16_IS_TRAIL((s)[i])) { \
        /* landed on a trail surrogate: step over its lead too */ \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Step the string offset back from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    /* a second step is taken only for a real lead+trail pair inside the string */ \
    if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th boundary before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip (evaluated once)
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u16_back_n_unsafe_left; \
    for(_u16_back_n_unsafe_left=(n); _u16_back_n_unsafe_left>0; --_u16_back_n_unsafe_left) { \
        U16_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th boundary before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * the loop stops early once the offset reaches start.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i
 * @param n number of code points to skip (evaluated once)
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u16_back_n_left; \
    for(_u16_back_n_left=(n); _u16_back_n_left>0 && (i)>(start); --_u16_back_n_left) { \
        U16_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Snap a random-access offset forward to the boundary after a code point.
 * When the offset is behind the lead surrogate of a surrogate pair it is
 * incremented by one; in every other case it is left as it is.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(!U16_IS_LEAD((s)[(i)-1])) { \
        /* previous unit is not a lead surrogate: i is left untouched */ \
    } else { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Snap a random-access offset forward to the boundary after a code point.
 * When the offset is behind the lead surrogate of a surrogate pair it is
 * incremented by one; in every other case it is left as it is.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, start<=i<=length
 * @param length int32_t string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    /* both neighbours s[i-1] and s[i] must be readable before they are tested */ \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        if(U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
            ++(i); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif

View File

@ -0,0 +1,881 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf8.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 8-bit Unicode handling macros
*
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF8_H__
#define __UTF8_H__
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* internal definitions ----------------------------------------------------- */
/**
 * Number of trail bytes that follow a given UTF-8 lead byte.
 * Yields 0 for 0..0xc1 and for 0xf5..0xff (bytes that are not lead bytes),
 * otherwise 1, 2 or 3.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
        1+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
 * Number of trail bytes that follow the lead byte of a valid UTF-8 sequence.
 * Yields 0 for 0..0xc1. Undefined for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * The count is the sum of three threshold tests, one per sequence length.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>0xc1)+((uint8_t)(leadByte)>0xdf)+((uint8_t)(leadByte)>0xef))
/**
 * Strip the length-marker bits from a UTF-8 lead byte in place, so that only
 * the low (6-countTrailBytes) bits, which are part of the code point value,
 * remain. leadByte must be a modifiable lvalue.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))
/**
 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * Lead byte E0..EF bits 3..0 are used as byte index,
 * first trail byte bits 7..5 are used as bit index into that byte.
 *
 * Table contents, one byte per lead byte E0..EF:
 * - E0 (0x20): only bit 5 is set, i.e. the first trail byte must be A0..BF.
 * - E1..EC and EE..EF (0x30): bits 4 and 5 are set, i.e. first trail byte 80..BF.
 * - ED (0x10): only bit 4 is set, i.e. the first trail byte must be 80..9F.
 * Bits 0..3 and 6..7 are clear everywhere, so a "trail" byte outside 80..BF never matches.
 *
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
 * Internal 3-byte UTF-8 validity check.
 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
 * Looks up the table byte selected by the low nibble of lead and tests the
 * bit selected by the top three bits of t1.
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
/**
 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * First trail byte bits 7..4 are used as byte index,
 * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
 *
 * Table contents, one byte per first-trail-byte high nibble:
 * - 0x..7x and Cx..Fx (0x00): never valid, the byte is not in the trail range 80..BF.
 * - 8x (0x1E): bits 1..4 are set, i.e. valid after lead bytes F1..F4.
 * - 9x, Ax, Bx (0x0F): bits 0..3 are set, i.e. valid after lead bytes F0..F3.
 *
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
 * Internal 4-byte UTF-8 validity check.
 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
 * Looks up the table byte selected by the high nibble of t1 and tests the
 * bit selected by the low three bits of lead.
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
/**
 * Function for handling "next code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): the definition and the calling macros are not in this part of the
 * header; the parameter notes below are inferred from the names and from the sibling
 * utf8_prevCharSafeBody() call sites, so confirm them against the implementation.
 *
 * @param s string bytes
 * @param pi presumably in/out pointer to the current string offset
 * @param length presumably the string length
 * @param c presumably the byte already read by the caller
 * @param strict presumably selects the strictness/error-result mode
 * @return presumably the decoded code point, or an error value
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
/**
 * Function for handling "append code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): the definition and the calling macros are not in this part of the
 * header; the parameter notes below are inferred from the names only, so confirm
 * them against the implementation.
 *
 * @param s output byte buffer (note: not const, it is written to)
 * @param i presumably the offset at which to write
 * @param length presumably the capacity of s
 * @param c code point to append
 * @param pIsError presumably set through the pointer when the append fails
 * @return presumably the offset after the appended bytes
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
 * Function for handling "previous code point" with error-checking.
 *
 * Called by U8_PREV (with strict=-1) and U8_PREV_OR_FFFD (with strict=-3) after
 * they have pre-decremented the offset and read a non-ASCII byte; the return
 * value is stored into the caller's code point variable.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * @param s string bytes
 * @param start starting string offset, as passed to the calling macro
 * @param pi pointer to the caller's offset, which already indexes the byte passed as c
 * @param c the non-ASCII byte at s[*pi]
 * @param strict mode selector; the macros in this file pass -1 or -3
 *        (presumably "negative result on error" vs. "U+FFFD on error" -- confirm)
 * @return the value the calling macro assigns to its output code point
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
/**
 * Function for handling "skip backward one code point" with error-checking.
 *
 * Called by U8_SET_CP_START and U8_BACK_1 when the byte at s[i] is a trail byte;
 * the return value becomes the caller's new offset.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * @param s string bytes
 * @param start starting string offset, as passed to the calling macro
 * @param i offset of a trail byte
 * @return the offset the calling macro should continue with
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* single-code point definitions -------------------------------------------- */
/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * True exactly when the top bit (0x80) of c is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))
/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 * Implemented as a single unsigned range check: subtracting 0xc2 and
 * truncating to uint8_t maps 0xC2..0xF4 onto 0..0x32 and makes every
 * smaller byte wrap around to a large value.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<0x33)
// 0x33=0xf4-0xc2+1
/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 * Reinterpreted as a signed 8-bit value, that range is exactly the values
 * below -0x40, so one signed comparison is enough.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) (-0x40>(int8_t)(c))
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * c is compared as an unsigned 32-bit value, so negative inputs count as
 * "not a Unicode code point". c may be evaluated several times.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<0x80 ? 1 : \
     (uint32_t)(c)<0x800 ? 2 : \
     (uint32_t)(c)<0xd800 ? 3 : \
     ((uint32_t)(c)<0xe000 || (uint32_t)(c)>=0x110000) ? 0 : \
     (uint32_t)(c)<0x10000 ? 3 : 4)
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This is the largest value U8_LENGTH can return.
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
/**
 * Read the code point that contains a random-access offset, leaving the
 * caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * of a code point; all bytes of that code point are read.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * Works on a private copy of the offset: first moved back to the lead byte,
 * then used to decode forward.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
} UPRV_BLOCK_MACRO_END
/**
 * Read the code point that contains a random-access offset, leaving the
 * caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * of a code point; all bytes of that code point are read.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * Works on a private copy of the offset: first moved back to the lead byte,
 * then used to decode forward.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
/**
 * Read the code point that contains a random-access offset, leaving the
 * caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * of a code point; all bytes of that code point are read.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_GET() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * @stable ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_or_fffd_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_or_fffd_pos); \
    U8_NEXT_OR_FFFD(s, _u8_get_or_fffd_pos, length, c); \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * The sequence length is chosen from the first byte alone (below 0xe0: two bytes,
 * below 0xf0: three bytes, otherwise four); trail bytes are masked with 0x3f but
 * never validated, and no string-length check is made.
 * s and i are evaluated several times.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
(i)+=2; \
} else { \
(c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
(i)+=3; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * Thin wrapper: forwards to U8_INTERNAL_NEXT_OR_SUB with U_SENTINEL as the
 * value substituted for ill-formed input.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * Thin wrapper: forwards to U8_INTERNAL_NEXT_OR_SUB with 0xfffd as the
 * value substituted for ill-formed input.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * @stable ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
/**
 * Shared implementation of U8_NEXT and U8_NEXT_OR_FFFD: decode one code point at
 * s[i], advance i, and store `sub` in c when the sequence is ill-formed.
 *
 * How it works:
 * - The first byte is always consumed (i is post-incremented). An ASCII byte is the result.
 * - Everything else is validated and assembled inside one short-circuit && chain.
 *   Each trail byte is inspected first and i is advanced only once that byte has been
 *   accepted, so after a failure i points at the first byte that did not fit.
 * - Reaching i==length between bytes ends the chain and counts as ill-formed. With a
 *   negative length that comparison never matches; the terminating NUL then fails the
 *   trail-byte test instead.
 * - 3-byte leads (E0..EF) validate the first trail byte through U8_LEAD3_T1_BITS,
 *   4-byte leads (F0..F4) through U8_LEAD4_T1_BITS; every other trail byte is checked
 *   with (byte-0x80)<=0x3f.
 * - 2-byte leads must be >=0xc2; lower non-ASCII first bytes fall through to `sub`.
 *
 * The statement order inside the condition is significant; do not reorder it.
 * s, i and length are evaluated several times.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length, may be negative for a NUL-terminated string
 * @param c output UChar32 variable
 * @param sub value stored in c for an ill-formed sequence
 * @internal
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t = 0; \
if((i)!=(length) && \
/* fetch/validate/assemble all but last trail byte */ \
((c)>=0xe0 ? \
((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
(__t&=0x3f, 1) \
: /* U+10000..U+10FFFF */ \
((c)-=0xf0)<=4 && \
U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
(__t=(s)[i]-0x80)<=0x3f) && \
/* valid second-to-last trail byte */ \
((c)=((c)<<6)|__t, ++(i)!=(length)) \
: /* U+0080..U+07FF */ \
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
/* last trail byte */ \
(__t=(s)[i]-0x80)<=0x3f && \
((c)=((c)<<6)|__t, ++(i), 1)) { \
} else { \
(c)=(sub); /* ill-formed*/ \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) past the bytes written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append (evaluated once)
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t _u8_append_unsafe_cp=(c); \
    if(_u8_append_unsafe_cp<0x80) { \
        /* one byte: ASCII */ \
        (s)[(i)++]=(uint8_t)_u8_append_unsafe_cp; \
    } else if(_u8_append_unsafe_cp<0x800) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp&0x3f)|0x80); \
    } else if(_u8_append_unsafe_cp<0x10000) { \
        /* three bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((_u8_append_unsafe_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp&0x3f)|0x80); \
    } else { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((_u8_append_unsafe_cp>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((_u8_append_unsafe_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_append_unsafe_cp&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) past the bytes written.
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE and nothing is written.
 *
 * An ASCII code point is written without a capacity check, which is why
 * the caller must guarantee i<capacity.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append (evaluated once)
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t _u8_append_cp=(c); \
    if(_u8_append_cp<0x80) { \
        (s)[(i)++]=(uint8_t)_u8_append_cp; \
    } else if(_u8_append_cp<0x800 && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)((_u8_append_cp>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((_u8_append_cp&0x3f)|0x80); \
    } else if((_u8_append_cp<0xd800 || (_u8_append_cp>=0xe000 && _u8_append_cp<0x10000)) && \
            (i)+2<(capacity)) { \
        /* BMP code point that is not a surrogate */ \
        (s)[(i)++]=(uint8_t)((_u8_append_cp>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((_u8_append_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_append_cp&0x3f)|0x80); \
    } else if(_u8_append_cp>0xffff && _u8_append_cp<0x110000 && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)((_u8_append_cp>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((_u8_append_cp>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((_u8_append_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_append_cp&0x3f)|0x80); \
    } else { \
        /* surrogate, out of range, or not enough room for the trail bytes */ \
        (isError)=TRUE; \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset from one code point boundary to the next one.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8: the step size is derived from
 * the byte at s[i] alone and the following bytes are not inspected.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i])+1; \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset from one code point boundary to the next one.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The first byte is always skipped. Trail bytes are skipped one at a time
 * and only while they continue a valid sequence and lie before `length`,
 * so an ill-formed or truncated sequence advances the offset past its
 * valid prefix only.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    uint8_t _u8_fwd_1_lead=(s)[(i)++]; \
    if(U8_IS_LEAD(_u8_fwd_1_lead) && (i)!=(length)) { \
        uint8_t _u8_fwd_1_t1=(s)[i]; \
        if(_u8_fwd_1_lead<0xe0) { \
            /* 2-byte sequence: a single plain trail byte */ \
            if(U8_IS_TRAIL(_u8_fwd_1_t1)) { \
                ++(i); \
            } \
        } else if(_u8_fwd_1_lead<0xf0) { \
            /* 3-byte sequence: first trail byte depends on the lead byte */ \
            if(U8_IS_VALID_LEAD3_AND_T1(_u8_fwd_1_lead, _u8_fwd_1_t1)) { \
                ++(i); \
                if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                    ++(i); \
                } \
            } \
        } else /* lead>=0xf0 */ { \
            /* 4-byte sequence: first trail byte depends on the lead byte */ \
            if(U8_IS_VALID_LEAD4_AND_T1(_u8_fwd_1_lead, _u8_fwd_1_t1)) { \
                ++(i); \
                if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                    ++(i); \
                    if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                        ++(i); \
                    } \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset forward by n code points, i.e. from one code point
 * boundary to the n-th boundary after it.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip (evaluated once)
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_fwd_n_unsafe_left; \
    for(_u8_fwd_n_unsafe_left=(n); _u8_fwd_n_unsafe_left>0; --_u8_fwd_n_unsafe_left) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset forward by n code points, i.e. from one code point
 * boundary to the n-th boundary after it.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries:
 * the loop stops early when the end of the string is reached.
 *
 * The length can be negative for a NUL-terminated string; in that case the
 * loop stops at the terminating NUL byte.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip (evaluated once)
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_fwd_n_left; \
    for(_u8_fwd_n_left=(n); \
            _u8_fwd_n_left>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); \
            --_u8_fwd_n_left) { \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Snap a random-access offset back to the start of the code point it is in.
 * While the offset points to a UTF-8 trail byte it is moved backward, one
 * byte at a time, until it reaches a non-trail byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END
/**
 * Snap a random-access offset back to the start of the code point it is in.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries;
 * the backward search is delegated to utf8_back1SafeBody().
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(!U8_IS_TRAIL((s)[(i)])) { \
        /* not a trail byte: i is left untouched */ \
    } else { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 *
 * At most the last three bytes are inspected, and never a byte before start.
 * The s argument is parenthesized at every use, so a compound expression
 * such as `buf+offset` is indexed as a whole.
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length); modifiable lvalue
 * @see U8_SET_CP_START
 * @stable ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lone lead byte at the end is always incomplete */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by one trail byte */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* two trail bytes: only a 4-byte lead can still be incomplete */ \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
 * Step the string offset back from one code point boundary to the previous
 * one and fetch the code point that lies between the two.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * Trail bytes are collected right to left, six bits at a time, until a
 * byte >=0xc0 is found; that byte is masked as the lead byte and supplies
 * the top bits.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t _u8_prev_unsafe_b, _u8_prev_unsafe_count=1, _u8_prev_unsafe_shift=6; \
        \
        /* c is the last trail byte: keep its six payload bits */ \
        (c)&=0x3f; \
        while((_u8_prev_unsafe_b=(s)[--(i)])<0xc0) { \
            /* one more trail byte */ \
            (c)|=(UChar32)(_u8_prev_unsafe_b&0x3f)<<_u8_prev_unsafe_shift; \
            ++_u8_prev_unsafe_count; \
            _u8_prev_unsafe_shift+=6; \
        } \
        /* reached the lead byte */ \
        U8_MASK_LEAD_BYTE(_u8_prev_unsafe_b, _u8_prev_unsafe_count); \
        (c)|=(UChar32)_u8_prev_unsafe_b<<_u8_prev_unsafe_shift; \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 * NOTE(review): earlier text said a lone lead byte "itself will be returned as the
 * code point"; every non-ASCII byte is handed to utf8_prevCharSafeBody(), so
 * confirm that statement against its implementation.
 *
 * An ASCII byte is returned directly; anything else is decoded by
 * utf8_prevCharSafeBody() with strict=-1. The s argument is parenthesized
 * before the cast so that a compound expression is converted as a whole.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 * NOTE(review): earlier text said a lone lead byte "itself will be returned as the
 * code point"; every non-ASCII byte is handed to utf8_prevCharSafeBody(), so
 * confirm that statement against its implementation.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * An ASCII byte is returned directly; anything else is decoded by
 * utf8_prevCharSafeBody() with strict=-3. The s argument is parenthesized
 * before the cast so that a compound expression is converted as a whole.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * @stable ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Step the string offset back by one code point, i.e. from one code point
* boundary to the boundary immediately before it.
* (Pre-decrementing backward iteration.)
* The offset passed in may equal the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The offset is decremented at least once and then keeps being decremented
* for as long as the byte it lands on satisfies U8_IS_TRAIL(). No lower bound
* is checked. s and i are expanded more than once and must not have side
* effects.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_BACK_1
* @stable ICU 2.4
*/
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[(i)])); \
} UPRV_BLOCK_MACRO_END
/**
* Step the string offset back by one code point, i.e. from one code point
* boundary to the boundary immediately before it.
* (Pre-decrementing backward iteration.)
* The offset passed in may equal the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The offset is decremented by one; if the byte it then points at satisfies
* U8_IS_TRAIL(), the final offset is computed by utf8_back1SafeBody().
* s and i are expanded more than once and must not have side effects.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @see U8_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Step the string offset back by n code points, i.e. from one code point
* boundary to the n-th boundary before it.
* (Pre-decrementing backward iteration.)
* The offset passed in may equal the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* n is evaluated once into a local counter; U8_BACK_1_UNSAFE() is then applied
* once per remaining count. Nothing happens when n is zero or negative.
* No lower bound on the offset is checked.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_BACK_N
* @stable ICU 2.4
*/
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    for(; __N>0; --__N) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Step the string offset back by n code points, i.e. from one code point
* boundary to the n-th boundary before it.
* (Pre-decrementing backward iteration.)
* The offset passed in may equal the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* n is evaluated once into a local counter; U8_BACK_1() is then applied
* repeatedly until either the counter reaches zero or the offset is no longer
* greater than start, whichever happens first.
*
* @param s const uint8_t * string
* @param start int32_t index of the start of the string
* @param i int32_t string offset, must be start<i
* @param n number of code points to skip
* @see U8_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    for(; __N>0 && (i)>(start); --__N) { \
        U8_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* Implemented as a step back by one code point (U8_BACK_1_UNSAFE) followed by
* a step forward by one code point (U8_FWD_1_UNSAFE); the order matters.
* Unlike U8_SET_CP_LIMIT(), no comparison against a start offset or a string
* length is made here, and the backward step always decrements i before
* reading a byte, so the caller must guarantee that a byte precedes the offset.
* s and i are passed to both helper macros and are therefore expanded more
* than once; they must not have side effects.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
U8_BACK_1_UNSAFE(s, i); \
U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
/**
* Snap a random-access offset to the code point boundary that follows a
* code point.
* When the offset sits behind only part of a multi-byte sequence, it is
* advanced to just behind the complete sequence.
* In every other case the offset is left as it is.
* The offset passed in may equal the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The adjustment is only attempted when start<i and, in addition, either
* i<length or length is negative; it then steps back one code point with
* U8_BACK_1() and forward one code point with U8_FWD_1().
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i<=length
* @param length int32_t string length
* @see U8_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((start)<(i)) { \
        if((i)<(length) || (length)<0) { \
            U8_BACK_1(s, start, i); \
            U8_FWD_1(s, i, length); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif