Compare commits

..

6 Commits

Author SHA1 Message Date
Peter Slattery 43fb4a757a Implement Tree_Sitter_Language_Definition, handle registering languages by extension, and looking up the appropriate language definition for a buffer.
custom_begin_buffer uses new functions to identify which files to treat as code
implement custom_render_buffer which uses tree sitter data to color tokens
2025-07-10 08:53:54 -07:00
Peter Slattery 7caaed736b Initial tree sitter usage:
- identify buffer language
- custom_begin_buffer sets up necessary tree sitter state, and kicks off a parse task
- custom_end_buffer cleans up tree sitter data and kills async parse tasks
- tree_sitter_parse_async/__inner uses tree sitter api to get a tree of the buffer's code and stores it on the buffers managed scope
- tree_sitter_write_tree prints tree to a special *tree* buffer
- use new build scripts in 4coder project
2025-07-10 07:08:27 -07:00
Peter Slattery 1a97b41257 Add tree sitter to compile command 2025-07-10 06:01:48 -07:00
Peter Slattery 8a918eef82 Checking for color support in build scripts 2025-07-10 06:01:34 -07:00
Peter Slattery d615358064 Updated build scripts to use common include flags 2025-07-08 10:05:45 -07:00
Peter Slattery 0b712c50e9 Added tree-sitter to project and got it building as a static library 2025-07-07 20:26:10 -07:00
68 changed files with 435347 additions and 90 deletions

1
.gitignore vendored
View File

@ -5,3 +5,4 @@ distributions/
build_stable/
code/generated
code/custom/generated
*.xcodeproj

View File

@ -26,9 +26,12 @@ SCRIPTS_DIR="$PROJECT_ROOT/build_new/scripts"
HELPERS_DIR="$PROJECT_ROOT/build_new/helpers"
# Include directories
INCLUDES=(
"$CUSTOM_DIR"
"$FOREIGN_DIR/freetype2"
# TODO: this isn't actually being used - it should be
FCODER_INCLUDES=(
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
# "-I$FOREIGN_DIR/freetype2"
"-I$FOREIGN_DIR/tree-sitter/lib/include"
)
# =============================================================================

View File

@ -1,11 +1,34 @@
#!/bin/bash
# Colors & Styles for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
colors_supported() {
# Check if stdout is a terminal
[[ -t 1 ]] || return 1
# Check if TERM is set and not "dumb"
[[ -n "$TERM" && "$TERM" != "dumb" ]] || return 1
# Check if tput is available and supports colors
if command -v tput >/dev/null 2>&1; then
tput setaf 1 >/dev/null 2>&1 || return 1
fi
return 0
}
if colors_supported; then
# Colors & Styles for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
else
RED=''
GREEN=''
BLUE=''
BOLD=''
NC=''
fi
print_success() {
printf "%b✓%b %s\n" "$GREEN" "$NC" "$1"

80
build_new/scripts/build-libs.sh Executable file
View File

@ -0,0 +1,80 @@
#!/bin/bash
# =============================================================================
# Configuration
# =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG_DIR="$SCRIPT_DIR/../config"
# Source configuration files
source "$CONFIG_DIR/build-config.sh"
source "$HELPERS_DIR/print-routines.sh"
# =============================================================================
# Build Tree Sitter
# =============================================================================
build_tree_sitter() {
BIN_NAME="custom_4coder"
CUSTOM_ROOT="$FOREIGN_DIR/tree-sitter"
INCLUDES=(
"-I$CUSTOM_ROOT/lib/src"
"-I$CUSTOM_ROOT/lib/include"
)
CLANG_OPTS=(
"-c" # Compile, don't link
"-O2" # Compile in release, regardless of 4coder build mode
"-g" # Debug info
)
TEMP_OUT_DIR=$BUILD_TEMP_DIR/tree-sitter
mkdir -p $TEMP_OUT_DIR
# Build tree-sitter.lib/.a
print_step "Building tree-sitter lib"
clang $CLANG_OPTS "${INCLUDES[@]}" "$CUSTOM_ROOT/lib/src/lib.c" -o $TEMP_OUT_DIR/tree-sitter.o
print_success "Complete"
# Lang: C++ (This needs to be two calls to clang so that you can specify the obj file names)
print_step "Building tree-sitter C++ Language Lib"
clang $CLANG_OPTS "${INCLUDES[@]}" "${CUSTOM_ROOT}/lang/cpp/parser.c" -o $TEMP_OUT_DIR/cpp_parser.o
clang $CLANG_OPTS "${INCLUDES[@]}" "${CUSTOM_ROOT}/lang/cpp/scanner.cc" -o $TEMP_OUT_DIR/cpp_scanner.o
print_success "Complete"
# Lang: C
print_step "Building tree-sitter C Language Lib"
clang $CLANG_OPTS "${INCLUDES[@]}" "${CUSTOM_ROOT}/lang/c/parser.c" -o $TEMP_OUT_DIR/c_parser.o
print_success "Complete"
# Link tree-sitter lib and parser obj files into a static library to link into main custom dll
print_step "Linking tree-sitter static library"
ar rcs $BUILD_TEMP_DIR/tree-sitter.a $TEMP_OUT_DIR/*.o
print_success "Completed"
}
# =============================================================================
# Main
# =============================================================================
main() {
local config="${1:-debug}"
local arch="${2:-x64}"
print_step "macOS Build Process"
print_info "Configuration: $config"
print_info "Architecture: $arch"
# Create build directory
mkdir -p "$BUILD_DIR"
# Execute build steps
build_tree_sitter
}
# Only run main if script is executed directly (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -111,24 +111,18 @@ build_core_engine() {
"-DFRED_INTERNAL"
)
# Include directories for custom layer
local include_flags=(
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
)
print_info "Compiling core application library..."
print_info "Input: $app_target"
print_info "Output: $output_lib"
# Compile core application shared library
if [[ "${BUILD_VERBOSE:-}" == "1" ]]; then
echo "Executing: $CXX ${build_flags[*]} ${include_flags[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
echo "Executing: $CXX ${build_flags[*]} ${FCODER_INCLUDES[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
fi
$CXX \
"${build_flags[@]}" \
"${include_flags[@]}" \
"${FCODER_INCLUDES[@]}" \
-o "$output_lib" \
"$app_target" \
"${LINK_LIBS[@]}"
@ -185,9 +179,7 @@ build_custom_layer() {
local preprocessed_file="$temp_dir/4coder_default_bindings.i"
$CXX \
-I"$CODE_DIR" \
-I"$CUSTOM_DIR" \
-I"$CUSTOM_DIR/generated" \
"${FCODER_INCLUDES[@]}" \
-DMETA_PASS \
-E \
"$custom_target" \
@ -202,9 +194,7 @@ build_custom_layer() {
$CXX \
"${COMPILE_FLAGS[@]}" \
-I"$CODE_DIR" \
-I"$CUSTOM_DIR" \
-I"$CUSTOM_DIR/generated" \
"${FCODER_INCLUDES[@]}" \
-o "$metadata_generator" \
"$metadata_source"
@ -230,8 +220,7 @@ build_custom_layer() {
"${COMPILE_FLAGS[@]}"
"-shared"
"-fPIC"
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
"${FCODER_INCLUDES[@]}"
"-DFRED_SUPER"
"-DFRED_INTERNAL"
)

View File

@ -1,6 +1,4 @@
#!/bin/bash
# build-macos.sh - macOS-specific build logic using clang
# Handles Objective-C++ files, frameworks, etc.
# Usage: ./build-macos.sh [debug|release] [x64|x86|arm64]
set -e # Exit on error
@ -137,24 +135,18 @@ build_core_engine() {
"-DFRED_INTERNAL"
)
# Include directories for custom layer
local include_flags=(
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
)
print_info "Compiling core application library..."
print_info "Input: $app_target"
print_info "Output: $output_lib"
# Compile core application shared library
if [[ "${BUILD_VERBOSE:-}" == "1" ]]; then
echo "Executing: $CXX ${build_flags[*]} ${include_flags[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
echo "Executing: $CXX ${build_flags[*]} ${FCODER_INCLUDES[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
fi
$CXX \
"${build_flags[@]}" \
"${include_flags[@]}" \
"${FCODER_INCLUDES[@]}" \
-o "$output_lib" \
"$app_target" \
"${LINK_LIBS[@]}"
@ -168,15 +160,11 @@ build_platform_layer() {
local platform_source="$CODE_DIR/platform_mac/mac_4ed.mm"
local output_exe="$BUILD_DIR/4ed"
local include_flags=(
"-I$CUSTOM_DIR"
)
# Build flags for executable
local build_flags=(
"${COMPILE_FLAGS[@]}"
"${PLATFORM_INCLUDES[@]}"
"${include_flags[@]}"
"${FCODER_INCLUDES[@]}"
"-DFRED_SUPER"
"-DFRED_INTERNAL"
)
@ -237,10 +225,10 @@ build_custom_layer() {
"${COMPILE_FLAGS[@]}"
"-shared"
"-fPIC"
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
"${FCODER_INCLUDES[@]}"
"-DFRED_SUPER"
"-DFRED_INTERNAL"
"$BUILD_TEMP_DIR/tree-sitter.a"
)
if [[ "${BUILD_VERBOSE:-}" == "1" ]]; then

View File

@ -17,10 +17,10 @@ META_MACROS="-DMETA_PASS"
print_step "Building Metadata"
print_info "Running C Preprocessor"
g++ -I"$CUSTOM_DIR" $META_MACROS $OPTS "$SOURCE" -E -o $PREPROC_FILE
g++ "${FCODER_INCLUDES[@]}" $META_MACROS $OPTS "$SOURCE" -E -o $PREPROC_FILE
print_info "Building Metadata Generator"
g++ -I"$CUSTOM_DIR" $OPTS $METADATA_GEN_SRC -o $METADATA_GEN_DST
g++ "${FCODER_INCLUDES[@]}" $OPTS $METADATA_GEN_SRC -o $METADATA_GEN_DST
print_info "Running Metadata Generator"
$METADATA_GEN_DST -R $CUSTOM_DIR "$PREPROC_FILE"

View File

@ -38,8 +38,7 @@ setup_win32_vars() {
# Base flags for Windows
COMMON_FLAGS=(
"${CLANG_OPTS_WINDOWS[@]}"
"-I$CODE_DIR"
"-I$FOREIGN_DIR/freetype2"
"${FCODER_INCLUDES[@]}"
)
# Architecture-specific flags
@ -161,24 +160,18 @@ build_core_engine() {
"-DFRED_INTERNAL"
)
# Include directories for custom layer
local include_flags=(
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
)
print_info "Compiling core application library..."
print_info "Input: $app_target"
print_info "Output: $output_lib"
# Compile core application shared library
if [[ "${BUILD_VERBOSE:-}" == "1" ]]; then
echo "Executing: $CXX ${build_flags[*]} ${include_flags[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
echo "Executing: $CXX ${build_flags[*]} ${FCODER_INCLUDES[*]} -o $output_lib $app_target ${LINK_LIBS[*]}"
fi
$CXX \
"${build_flags[@]}" \
"${include_flags[@]}" \
"${FCODER_INCLUDES[@]}" \
-o "$output_lib" \
"$app_target" \
"${LINK_LIBS[@]}"
@ -270,8 +263,7 @@ build_custom_layer() {
local build_flags=(
"${COMPILE_FLAGS[@]}"
"-shared"
"-I$CUSTOM_DIR"
"-I$CUSTOM_DIR/generated"
"${FCODER_INCLUDES[@]}"
"-DFRED_SUPER"
"-DFRED_INTERNAL"
)

View File

@ -19,37 +19,8 @@ HELPERS_DIR="$SCRIPT_DIR/../helpers"
# Source configuration files
source "$CONFIG_DIR/build-config.sh"
source "$HELPERS_DIR/print-routines.sh"
# =============================================================================
# Utility Functions
# =============================================================================
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
print_success() {
echo -e "${GREEN}${NC} $1"
}
print_warning() {
echo -e "${YELLOW}${NC} $1"
}
print_error() {
echo -e "${RED}${NC} $1"
}
print_info() {
echo -e "${BLUE}${NC} $1"
}
print_step() {
echo -e "${BLUE}===${NC} $1 ${BLUE}===${NC}"
}
show_usage() {
echo "Usage: $0 [platform] [config] [arch]"

View File

@ -0,0 +1,293 @@
///////////////////////////////////////////////////////////////////////////
// Begin Buffer
///////////////////////////////////////////////////////////////////////////
BUFFER_HOOK_SIG(custom_begin_buffer){
ProfileScope(app, "begin buffer");
Scratch_Block scratch(app);
Tree_Sitter_Language_Definition* language = tree_sitter_language_for_buffer(app, buffer_id);
bool treat_as_code = language != 0;
if (treat_as_code) tree_sitter_begin_buffer(app, buffer_id);
String_ID file_map_id = vars_save_string_lit("keys_file");
String_ID code_map_id = vars_save_string_lit("keys_code");
Command_Map_ID map_id = (treat_as_code)?(code_map_id):(file_map_id);
Managed_Scope scope = buffer_get_managed_scope(app, buffer_id);
Command_Map_ID *map_id_ptr = scope_attachment(app, scope, buffer_map_id, Command_Map_ID);
*map_id_ptr = map_id;
Line_Ending_Kind setting = guess_line_ending_kind_from_buffer(app, buffer_id);
Line_Ending_Kind *eol_setting = scope_attachment(app, scope, buffer_eol_setting, Line_Ending_Kind);
*eol_setting = setting;
// NOTE(allen): Decide buffer settings
b32 wrap_lines = true;
b32 use_lexer = false;
if (treat_as_code){
wrap_lines = def_get_config_b32(vars_save_string_lit("enable_code_wrapping"));
// TODO(PS): @Remove - consider removing the lexer for now? later, replace in favor of tree-sitter
use_lexer = true;
}
if (treat_as_code)
{
Async_Task* parse_task = scope_attachment(app, scope, buffer_tree_sitter_parse_task_id, Async_Task);
*parse_task = async_task_no_dep(&global_async_system, tree_sitter_parse_async, make_data_struct(&buffer_id));
}
String_Const_u8 buffer_name = push_buffer_base_name(app, scratch, buffer_id);
if (buffer_name.size > 0 && buffer_name.str[0] == '*' && buffer_name.str[buffer_name.size - 1] == '*'){
wrap_lines = def_get_config_b32(vars_save_string_lit("enable_output_wrapping"));
}
if (use_lexer){
ProfileBlock(app, "begin buffer kick off lexer");
Async_Task *lex_task_ptr = scope_attachment(app, scope, buffer_lex_task, Async_Task);
*lex_task_ptr = async_task_no_dep(&global_async_system, do_full_lex_async, make_data_struct(&buffer_id));
}
{
b32 *wrap_lines_ptr = scope_attachment(app, scope, buffer_wrap_lines, b32);
*wrap_lines_ptr = wrap_lines;
}
if (use_lexer){
buffer_set_layout(app, buffer_id, layout_virt_indent_index_generic);
}
else{
if (treat_as_code){
buffer_set_layout(app, buffer_id, layout_virt_indent_literal_generic);
}
else{
buffer_set_layout(app, buffer_id, layout_generic);
}
}
// no meaning for return
return(0);
}
///////////////////////////////////////////////////////////////////////////
// End Buffer
///////////////////////////////////////////////////////////////////////////
BUFFER_HOOK_SIG(custom_end_buffer){
Marker_List *list = get_marker_list_for_buffer(buffer_id);
if (list != 0) delete_marker_list(list);
tree_sitter_end_buffer(app, buffer_id);
default_end_buffer(app, buffer_id);
return(0);
}
///////////////////////////////////////////////////////////////////////////
// Render Buffer
///////////////////////////////////////////////////////////////////////////
function void custom_render_buffer(
Application_Links *app,
View_ID view_id,
Face_ID face_id,
Buffer_ID buffer,
Text_Layout_ID text_layout_id,
Rect_f32 rect
){
ProfileScope(app, "render buffer");
Scratch_Block scratch(app);
View_ID active_view = get_active_view(app, Access_Always);
b32 is_active_view = (active_view == view_id);
Rect_f32 prev_clip = draw_set_clip(app, rect);
Range_i64 visible_range = text_layout_get_visible_range(app, text_layout_id);
// NOTE(allen): Cursor shape
Face_Metrics metrics = get_face_metrics(app, face_id);
u64 cursor_roundness_100 = def_get_config_u64(app, vars_save_string_lit("cursor_roundness"));
f32 cursor_roundness = metrics.normal_advance*cursor_roundness_100*0.01f;
f32 mark_thickness = (f32)def_get_config_u64(app, vars_save_string_lit("mark_thickness"));
// NOTE(allen): Token colorizing
Managed_Scope buffer_scope = buffer_get_managed_scope(app, buffer);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, buffer_scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
TSTree* tree = tree_sitter_buffer_get_tree_copy(tree_data);
Token_Array token_array = get_token_array_from_buffer(app, buffer);
paint_text_color_fcolor(app, text_layout_id, visible_range, fcolor_id(defcolor_text_default)); // will get overridden by lang-specific token coloring below
if (token_array.tokens != 0 && tree)
{
if (use_tree_sitter_token_coloring)
{
draw_tree_sitter_node_colors(app, text_layout_id, buffer);
}
else
{
draw_cpp_token_colors(app, text_layout_id, &token_array);
}
}
i64 cursor_pos = view_correct_cursor(app, view_id);
view_correct_mark(app, view_id);
// NOTE(allen): Scope highlight
b32 use_scope_highlight = def_get_config_b32(vars_save_string_lit("use_scope_highlight"));
if (use_scope_highlight){
Color_Array colors = finalize_color_array(defcolor_back_cycle);
draw_scope_highlight(app, buffer, text_layout_id, cursor_pos, colors.vals, colors.count);
}
// NOTE(PS): QOL Column
if (qol_col_cursor.pos >= 0){
Buffer_Seek seek = seek_line_col(qol_col_cursor.line, qol_col_cursor.col);
Buffer_Cursor cursor = buffer_compute_cursor(app, buffer, seek);
Rect_f32 col_rect = text_layout_character_on_screen(app, text_layout_id, cursor.pos);
if (col_rect.x1 > 0.f){
col_rect.y0 = rect.y0;
col_rect.y1 = rect.y1;
draw_rectangle_fcolor(app, col_rect, 0.f, fcolor_id(defcolor_highlight_cursor_line));
}
}
b32 use_error_highlight = def_get_config_b32(vars_save_string_lit("use_error_highlight"));
b32 use_jump_highlight = def_get_config_b32(vars_save_string_lit("use_jump_highlight"));
if (use_error_highlight || use_jump_highlight){
// NOTE(allen): Error highlight
String_Const_u8 name = string_u8_litexpr("*compilation*");
Buffer_ID compilation_buffer = get_buffer_by_name(app, name, Access_Always);
if (use_error_highlight){
draw_jump_highlights(app, buffer, text_layout_id, compilation_buffer,
fcolor_id(defcolor_highlight_junk));
}
// NOTE(allen): Search highlight
if (use_jump_highlight){
Buffer_ID jump_buffer = get_locked_jump_buffer(app);
if (jump_buffer != compilation_buffer){
draw_jump_highlights(app, buffer, text_layout_id, jump_buffer,
fcolor_id(defcolor_highlight_white));
}
}
}
// NOTE(allen): Color parens
b32 use_paren_helper = def_get_config_b32(vars_save_string_lit("use_paren_helper"));
if (use_paren_helper){
Color_Array colors = finalize_color_array(defcolor_text_cycle);
draw_paren_highlight(app, buffer, text_layout_id, cursor_pos, colors.vals, colors.count);
}
// NOTE(allen): Line highlight
b32 highlight_line_at_cursor = def_get_config_b32(vars_save_string_lit("highlight_line_at_cursor"));
if (highlight_line_at_cursor && is_active_view){
i64 line_number = get_line_number_from_pos(app, buffer, cursor_pos);
draw_line_highlight(app, text_layout_id, line_number, fcolor_id(defcolor_highlight_cursor_line));
}
// NOTE(allen): Whitespace highlight
b64 show_whitespace = false;
view_get_setting(app, view_id, ViewSetting_ShowWhitespace, &show_whitespace);
if (show_whitespace){
if (token_array.tokens == 0){
draw_whitespace_highlight(app, buffer, text_layout_id, cursor_roundness);
}
else{
draw_whitespace_highlight(app, text_layout_id, &token_array, cursor_roundness);
}
}
// NOTE(allen): Cursor
switch (fcoder_mode){
case FCoderMode_Original:
{
draw_original_4coder_style_cursor_mark_highlight(app, view_id, is_active_view, buffer, text_layout_id, cursor_roundness, mark_thickness);
}break;
case FCoderMode_NotepadLike:
{
draw_notepad_style_cursor_highlight(app, view_id, buffer, text_layout_id, cursor_roundness);
}break;
}
// NOTE(allen): Fade ranges
paint_fade_ranges(app, text_layout_id, buffer);
// NOTE(allen): put the actual text on the actual screen
draw_text_layout_default(app, text_layout_id);
draw_set_clip(app, prev_clip);
}
function void
custom_render_caller(Application_Links *app, Frame_Info frame_info, View_ID view_id){
ProfileScope(app, "default render caller");
View_ID active_view = get_active_view(app, Access_Always);
b32 is_active_view = (active_view == view_id);
Rect_f32 region = draw_background_and_margin(app, view_id, is_active_view);
Rect_f32 prev_clip = draw_set_clip(app, region);
Buffer_ID buffer = view_get_buffer(app, view_id, Access_Always);
Face_ID face_id = get_face_id(app, buffer);
Face_Metrics face_metrics = get_face_metrics(app, face_id);
f32 line_height = face_metrics.line_height;
f32 digit_advance = face_metrics.decimal_digit_advance;
// NOTE(allen): file bar
b64 showing_file_bar = false;
if (view_get_setting(app, view_id, ViewSetting_ShowFileBar, &showing_file_bar) && showing_file_bar){
Rect_f32_Pair pair = layout_file_bar_on_top(region, line_height);
draw_file_bar(app, view_id, buffer, face_id, pair.min);
region = pair.max;
}
Buffer_Scroll scroll = view_get_buffer_scroll(app, view_id);
Buffer_Point_Delta_Result delta = delta_apply(app, view_id,
frame_info.animation_dt, scroll);
if (!block_match_struct(&scroll.position, &delta.point)){
block_copy_struct(&scroll.position, &delta.point);
view_set_buffer_scroll(app, view_id, scroll, SetBufferScroll_NoCursorChange);
}
if (delta.still_animating){
animate_in_n_milliseconds(app, 0);
}
// NOTE(allen): query bars
region = default_draw_query_bars(app, region, view_id, face_id);
// NOTE(allen): FPS hud
if (show_fps_hud){
Rect_f32_Pair pair = layout_fps_hud_on_bottom(region, line_height);
draw_fps_hud(app, frame_info, face_id, pair.max);
region = pair.min;
animate_in_n_milliseconds(app, 1000);
}
// NOTE(allen): layout line numbers
b32 show_line_number_margins = def_get_config_b32(vars_save_string_lit("show_line_number_margins"));
Rect_f32 line_number_rect = {};
if (show_line_number_margins){
Rect_f32_Pair pair = layout_line_number_margin(app, buffer, region, digit_advance);
line_number_rect = pair.min;
region = pair.max;
}
// NOTE(allen): begin buffer render
Buffer_Point buffer_point = scroll.position;
Text_Layout_ID text_layout_id = text_layout_create(app, buffer, region, buffer_point);
// NOTE(allen): draw line numbers
if (show_line_number_margins){
draw_line_number_margin(app, view_id, buffer, face_id, text_layout_id, line_number_rect);
}
// NOTE(allen): draw the buffer
custom_render_buffer(app, view_id, face_id, buffer, text_layout_id, region);
loco_render_buffer(app, view_id, face_id, buffer, text_layout_id, region, frame_info);
text_layout_free(app, text_layout_id);
draw_set_clip(app, prev_clip);
}

View File

@ -527,6 +527,10 @@ custom_layer_init(Application_Links *app){
set_all_default_hooks(app);
modal_init(3, tctx);
set_custom_hook(app, HookID_BeginBuffer, custom_begin_buffer);
set_custom_hook(app, HookID_EndBuffer, custom_end_buffer);
set_custom_hook(app, HookID_RenderCaller, custom_render_caller);
custom_keyboard_bindings();
#if 0
@ -542,6 +546,7 @@ custom_layer_init(Application_Links *app){
setup_essential_mapping(&framework_mapping, global_map_id, file_map_id, code_map_id);
#endif
tree_sitter_init(app);
}
#endif //FCODER_DEFAULT_BINDINGS

View File

@ -208,7 +208,9 @@ reload_clean_buffers_on_filesystem_change(Application_Links *app, Frame_Info fra
function void
default_tick(Application_Links *app, Frame_Info frame_info){
code_index_update_tick(app);
if (use_tree_sitter_code_indexing) { tree_sitter_code_index_update_tick(app); }
else { code_index_update_tick(app); }
if (tick_all_fade_ranges(app, frame_info.animation_dt)){
animate_in_n_milliseconds(app, 0);

View File

@ -65,6 +65,7 @@
#include "4coder_search_list.h"
#include "4coder_modal.h"
#include "4coder_qol.h"
#include "4coder_tree_sitter.h"
////////////////////////////////
@ -143,10 +144,12 @@
#include "4coder_search_list.cpp"
#include "4coder_modal.cpp"
#include "4coder_yeet.cpp"
#include "4coder_tree_sitter.cpp"
#include "4coder_examples.cpp"
#include "4coder_default_hooks.cpp"
#include "4coder_custom_hooks.cpp"
#include "4coder_qol.cpp"

View File

@ -0,0 +1,449 @@
/////////////////////////////////////////////
// TEMP until I implement more generic language stuff
/////////////////////////////////////////////
TSQuery* tree_sitter_cpp_index_query;
String_Const_u8 TS_CPP_INDEX_QUERY = string_u8_litexpr("(_ \"{\" @Start \"}\" @End ) @ScopeNest\n");
String_Const_u8 TS_CPP_HIGHLIGHT_QUERY = string_u8_litexpr("(call_expression function: ["
" (identifier) @defcolor_function"
" (field_expression field: (field_identifier) @defcolor_function)])"
"(function_declarator"
" declarator: [(identifier) (field_identifier)] @defcolor_function)"
"(preproc_def"
" name: (identifier) @defcolor_macro)"
"(preproc_function_def"
" name: (identifier) @defcolor_macro)"
"(type_identifier) @defcolor_type"
"(call_expression"
" function: (parenthesized_expression"
" (identifier) @defcolor_type))"
"[(primitive_type) (type_qualifier) (storage_class_specifier)"
" (break_statement) (continue_statement) \"union\" \"return\" \"do\""
" \"while\" \"for\" \"if\" \"class\" \"struct\" \"enum\" \"sizeof\""
" \"else\" \"switch\" \"case\"] @defcolor_keyword"
"[(number_literal) (string_literal)] @defcolor_str_constant"
"[(preproc_directive) \"#define\" \"#if\" \"#elif\" \"#else\" \"#endif\""
" \"#include\"] @defcolor_preproc"
"[\"{\" \"}\" \";\" \":\" \",\"] @defcolor_text_default"
"(comment) @defcolor_comment");
////////////////////////////////////////////////////////////////////
// Language Management
////////////////////////////////////////////////////////////////////
function void
tree_sitter_register_language(String_Const_u8 ext, TSLanguage* language, TSQuery* highlight_query)
{
Tree_Sitter_Language_Definition* lang = 0;
u64 hash = table_hash_u8(ext.str, ext.size);
u64 slot = hash % ArrayCount(tree_sitter_languages.languages);
for (Tree_Sitter_Language_Definition* l = tree_sitter_languages.languages[slot]; l != 0; l = l->next)
{
if (l->extension_hash == hash && string_match(l->extension, ext))
{
lang = l; break;
}
}
if (lang == 0)
{
lang = push_array(&tree_sitter_languages.arena, Tree_Sitter_Language_Definition, 1);
lang->next = tree_sitter_languages.languages[slot];
tree_sitter_languages.languages[slot] = lang;
lang->extension_hash = hash;
lang->extension = push_string_copy(&tree_sitter_languages.arena, ext);
lang->language = language;
lang->highlight_query = highlight_query;
}
}
function Tree_Sitter_Language_Definition*
tree_sitter_language_from_file_extension(String_Const_u8 ext)
{
Tree_Sitter_Language_Definition* result = 0;
u64 ext_hash = table_hash_u8(ext.str, ext.size);
u64 slot = ext_hash % ArrayCount(tree_sitter_languages.languages);
for (Tree_Sitter_Language_Definition* l = tree_sitter_languages.languages[slot]; l != 0; l = l->next)
{
if (l->extension_hash == ext_hash && string_match(l->extension, ext))
{
result = l;
break;
}
}
return result;
}
function Tree_Sitter_Language_Definition*
tree_sitter_language_for_buffer(Application_Links* app, Buffer_ID buffer_id, Arena* arena)
{
Tree_Sitter_Language_Definition* result = 0;
String_Const_u8 file_name = push_buffer_file_name(app, arena, buffer_id);
String_Const_u8 extension = string_file_extension(file_name);
result = tree_sitter_language_from_file_extension(extension);
return result;
}
function Tree_Sitter_Language_Definition*
tree_sitter_language_for_buffer(Application_Links* app, Buffer_ID buffer_id)
{
Scratch_Block scratch(app);
return tree_sitter_language_for_buffer(app, buffer_id, scratch);
}
/////////////////////////////////////////////
// Tree Sitter Hook Internals
/////////////////////////////////////////////
function void
tree_sitter_init(Application_Links* app)
{
Buffer_ID buffer = create_buffer(
app,
string_u8_litexpr("*tree*"),
BufferCreate_NeverAttachToFile | BufferCreate_AlwaysNew
);
buffer_set_setting(app, buffer, BufferSetting_Unimportant, true);
buffer_set_setting(app, buffer, BufferSetting_ReadOnly, true);
tree_sitter_languages.arena = make_arena_system(KB(16));
u32 error_offset;
TSQueryError query_error;
{ // Register CPP
TSLanguage* language = tree_sitter_cpp();
String_Const_u8 highlight_query_str = TS_CPP_HIGHLIGHT_QUERY;
TSQuery* highlight_query = ts_query_new(
language,
(const char *)TS_CPP_HIGHLIGHT_QUERY.str,
(u32)TS_CPP_HIGHLIGHT_QUERY.size,
&error_offset, &query_error
);
tree_sitter_register_language(SCu8("cpp"), language, highlight_query);
tree_sitter_register_language(SCu8("h"), language, highlight_query);
tree_sitter_register_language(SCu8("hpp"), language, highlight_query);
tree_sitter_register_language(SCu8("cc"), language, highlight_query);
}
}
function void
tree_sitter_begin_buffer(Application_Links* app, Buffer_ID buffer_id)
{
Managed_Scope buffer_scope = buffer_get_managed_scope(app, buffer_id);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, buffer_scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
tree_data->tree_mutex = system_mutex_make();
}
function void
tree_sitter_end_buffer(Application_Links* app, Buffer_ID buffer_id)
{
Managed_Scope buffer_scope = buffer_get_managed_scope(app, buffer_id);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, buffer_scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
if (!tree_data) return;
Async_Task *tree_sitter_parse_task = scope_attachment(app, buffer_scope, buffer_tree_sitter_parse_task_id, Async_Task);
if (async_task_is_running_or_pending(&global_async_system, *tree_sitter_parse_task))
{
async_task_cancel(app, &global_async_system, *tree_sitter_parse_task);
}
system_mutex_acquire(tree_data->tree_mutex);
ts_tree_delete(tree_data->tree);
system_mutex_release(tree_data->tree_mutex);
system_mutex_free(tree_data->tree_mutex);
}
function TSTree*
tree_sitter_buffer_get_tree_copy(Buffer_Tree_Sitter_Data* tree_data)
{
TSTree* result = 0;
// system_mutex_acquire(tree_data->tree_mutex);
if (tree_data->tree) result = ts_tree_copy(tree_data->tree);
// system_mutex_release(tree_data->tree_mutex);
return result;
}
function void
tree_sitter_parse_async__inner(Async_Context* actx, Buffer_ID buffer_id)
{
Application_Links *app = actx->app;
Arena arena = make_arena_system(KB(16));
TSParser *parser = ts_parser_new();
ts_parser_set_timeout_micros(parser, 5000);
acquire_global_frame_mutex(app);
Tree_Sitter_Language_Definition* lang = tree_sitter_language_for_buffer(app, buffer_id);
String_Const_u8 src = push_whole_buffer(app, &arena, buffer_id);
Managed_Scope scope = buffer_get_managed_scope(app, buffer_id);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
TSTree *old_tree = tree_sitter_buffer_get_tree_copy(tree_data);
bool lang_set = ts_parser_set_language(parser, lang->language);
release_global_frame_mutex(app);
if (!lang_set)
{
AssertMessageAlways("Failed to set the language for the parser."
"This probably means a language wasn't set"
"in the BeginBuffer hook.\n");
}
// Iterate until we get a tree or we find that we should cancel the parse
TSTree *new_tree = 0;
b32 canceled = false;
for (;;)
{
new_tree = ts_parser_parse_string(parser, old_tree, (char *)src.str, (u32)src.size);
if (async_check_canceled(actx))
{
canceled = true;
break;
}
if (new_tree) break;
}
if (!canceled && new_tree)
{
TSTree* old_buffer_tree;
acquire_global_frame_mutex(app);
{
// NOTE(jack): Copy the old pointer to delete it outside the mutex.
system_mutex_acquire(tree_data->tree_mutex);
old_buffer_tree = tree_data->tree;
tree_data->tree = new_tree;
system_mutex_acquire(tree_data->tree_mutex);
print_message(app, SCu8("Finished Parse\n"));
// TODO(PS): Just put the code index update call here
// NOTE(jack): This feels kinda hacky, this is here to trigger
// the code index update tick. The buffer is also makred by the
// async lexer so we will update the index too frequently. We
// should probably change the lexer to not mark as modified.
// TODO(jack): Should we instead trigger another async task here to
// update the code index once this is done?
buffer_mark_as_modified(buffer_id);
// Force a frame refresh by requesting another frame
animate_in_n_milliseconds(app, 0);
}
release_global_frame_mutex(app);
ts_tree_delete(old_buffer_tree);
}
ts_parser_delete(parser);
ts_tree_delete(old_tree);
linalloc_clear(&arena);
}
function void
tree_sitter_parse_async(Async_Context* actx, String_Const_u8 data)
{
if (data.size != sizeof(Buffer_ID)) return;
Buffer_ID buffer_id = *(Buffer_ID*)data.str;
tree_sitter_parse_async__inner(actx, buffer_id);
}
function Range_i64
tree_sitter_node_to_range(TSNode node)
{
Range_i64 result;
result.start = ts_node_start_byte(node);
result.end = ts_node_end_byte(node);
return result;
}
function void
tree_sitter_code_index_update_tick(Application_Links* app)
{
Scratch_Block scratch(app);
#if 0
// TODO(PS): this should be done when we register the language
if (!tree_sitter_cpp_index_query)
{
u32 error_offset;
TSQueryError query_error;
tree_sitter_cpp_index_query = ts_query_new(
tree_sitter_cpp(), TS_CPP_INDEX_QUERY, (u32)TS_CPP_INDEX_QUERY.size, &error_offset, &query_error
);
if (!tree_sitter_cpp_index_query)
{
print_message(app, string_u8_litexpr("Failed to create cpp index query\n");
}
}
for (Buffer_Modified_Node* modified_node = global_buffer_modified_set.first;
modified_node != 0;
modified_node = modified_node->next
){
Temp_Memory_Block temp(scratch);
Buffer_ID buffer_id = modified_node->buffer;
Arena arena = make_arena_system(KB(16));
Code_Index_File* index = push_array_zero(&arena, Code_Index_File, 1);
index->buffer = buffer_id;
Tree_Sitter_Code_Index_Nest_List nests = {};
Managed_Scope buffer_scope = buffer_get_managed_scope(app, buffer_id);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, buffer_scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
TSTree* tree = tree_sitter_buffer_get_tree_copy(tree_data);
if (tree)
{
TSQueryCursor* query_cursor = ts_query_cursor_new();
ts_query_cursor_exec(query_cursor, tree_sitter_cpp_index_query, ts_tree_root_node(tree));
TSQueryMatch query_match;
while (ts_query_cursor_next_match(query_cursor, &match))
{
TSQueryCapture type_capture = match.captures[0];
TSNode type_node = type_capture.node;
Range_i64 type_range = tree_sitter_node_to_range(type_node);
// TODO(PS): PICK UP HERE
}
}
ts_tree_delete(tree);
}
#endif
}
////////////////////////////////////////////////////////////////////
// Token Highlighting
////////////////////////////////////////////////////////////////////
function void
tree_sitter_highlight_node(
Application_Links* app,
TSQuery* query,
TSNode top_node,
TSQueryCursor* query_cursor,
Text_Layout_ID text_layout_id
){
ts_query_cursor_exec(query_cursor, query, top_node);
TSQueryMatch query_match;
u32 capture_index;
while (ts_query_cursor_next_capture(query_cursor, &query_match, &capture_index))
{
TSQueryCapture capture = query_match.captures[capture_index];
TSNode node = capture.node;
u32 length;
const char* tmp = ts_query_capture_name_for_id(query, capture.index, &length);
String_Const_u8 capture_name = SCu8((char*)tmp, length);
Range_i64 highlight_range = tree_sitter_node_to_range(node);
Managed_ID color_id = managed_id_get(app, SCu8("colors"), capture_name);
if (color_id != 0)
{
paint_text_color_fcolor(app, text_layout_id, highlight_range, fcolor_id(color_id));
}
}
}
function void
draw_tree_sitter_node_colors(Application_Links* app, Text_Layout_ID text_layout_id, Buffer_ID buffer_id)
{
Tree_Sitter_Language_Definition* lang = tree_sitter_language_for_buffer(app, buffer_id);
TSQuery* query = lang->highlight_query;
Range_i64 visible_range = text_layout_get_visible_range(app, text_layout_id);
Managed_Scope buffer_scope = buffer_get_managed_scope(app, buffer_id);
Buffer_Tree_Sitter_Data* tree_data = scope_attachment(app, buffer_scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
TSTree* tree = tree_sitter_buffer_get_tree_copy(tree_data);
if (tree)
{
TSNode root = ts_tree_root_node(tree);
// Get the smallest node that fully contains the visible range
TSNode visible_container = ts_node_descendant_for_byte_range(root, (u32)visible_range.start, (u32)visible_range.end);
TSQueryCursor* query_cursor = ts_query_cursor_new();
if (ts_node_eq(root, visible_container))
{
TSTreeCursor tree_cursor = ts_tree_cursor_new(visible_container);
if (ts_tree_cursor_goto_first_child_for_byte(&tree_cursor, (u32)visible_range.start) != -1)
{
do {
TSNode node = ts_tree_cursor_current_node(&tree_cursor);
Range_i64 child_range = tree_sitter_node_to_range(node);
if (child_range.start > visible_range.end) break;
tree_sitter_highlight_node(app, query, node, query_cursor, text_layout_id);
} while(ts_tree_cursor_goto_next_sibling(&tree_cursor));
}
else
{
// Pathological case - just highligh the whole document. This is probably bad
tree_sitter_highlight_node(app, query, root, query_cursor, text_layout_id);
}
}
else
{
tree_sitter_highlight_node(app, query, visible_container, query_cursor, text_layout_id);
}
ts_query_cursor_delete(query_cursor);
}
ts_tree_delete(tree);
}
////////////////////////////////////////////////////////////////////
// DEBUG
////////////////////////////////////////////////////////////////////
char* prefix_buffer = " ";
function void
write_tree_sitter_tree_to_buffer__inner(Application_Links *app, Arena *arena, Buffer_ID buffer_id,
TSNode cur_node, i32 level = 0, const char *field="")
{
TSPoint start = ts_node_start_point(cur_node);
TSPoint end = ts_node_end_point(cur_node);
// + 1 on ts positions becuase the first line/column are zero in treesitter,
// but 4coder displays as 1 indexed in the filebar.
String_Const_u8 string = push_stringf(arena, "%.*s%s: %s [%d, %d] - [%d, %d]\n",
level*2, prefix_buffer, field, ts_node_type(cur_node),
start.row + 1, start.column + 1,
end.row + 1, end.column + 1);
buffer_replace_range(app, buffer_id, Ii64(buffer_get_size(app, buffer_id)), string);
u32 child_count = ts_node_child_count(cur_node);
for (u32 i = 0; i < child_count; ++i)
{
TSNode child = ts_node_child(cur_node, i);
if (ts_node_is_named(child))
{
field = ts_node_field_name_for_child(cur_node, i);
if (!field) field = "";
write_tree_sitter_tree_to_buffer__inner(app, arena, buffer_id, child, level + 1, field);
}
}
}
CUSTOM_COMMAND_SIG(tree_sitter_write_tree)
CUSTOM_DOC("Write the current buffer's tree sitter tree to *tree*")
{
Scratch_Block scratch(app);
Buffer_ID out_buffer = get_buffer_by_name(app, string_u8_litexpr("*tree*"), Access_Always);
View_ID view = get_active_view(app, Access_Always);
Buffer_ID buffer = view_get_buffer(app, view, Access_Visible);
Managed_Scope scope = buffer_get_managed_scope(app, buffer);
Buffer_Tree_Sitter_Data *tree_data = scope_attachment(app, scope, buffer_tree_sitter_data_id, Buffer_Tree_Sitter_Data);
if (tree_data->tree)
{
TSNode root = ts_tree_root_node(tree_data->tree);
write_tree_sitter_tree_to_buffer__inner(app, scratch, out_buffer, root);
}
}

View File

@ -0,0 +1,62 @@
/* date = July 8th 2025 10:13 am */
#ifndef FCODER_TREE_SITTER_H
#define FCODER_TREE_SITTER_H
#include <tree_sitter/api.h>
struct Tree_Sitter_Language_Definition
{
String_Const_u8 extension;
u64 extension_hash;
TSLanguage* language;
TSQuery* highlight_query;
Tree_Sitter_Language_Definition* next;
};
struct Tree_Sitter_Languages
{
Arena arena;
Tree_Sitter_Language_Definition* languages[4096];
};
global Tree_Sitter_Languages tree_sitter_languages;
extern "C" {
TSLanguage *tree_sitter_cpp();
TSLanguage *tree_sitter_c();
}
CUSTOM_ID(attachment, buffer_tree_sitter_data_id);
CUSTOM_ID(attachment, buffer_tree_sitter_parse_task_id);
struct Buffer_Tree_Sitter_Data
{
TSTree* tree;
System_Mutex tree_mutex;
};
struct Tree_Sitter_Code_Index_Nest_Node
{
Tree_Sitter_Code_Index_Nest_Node* next;
Tree_Sitter_Code_Index_Nest_Node* prev;
Tree_Sitter_Code_Index_Nest_Node* parent;
Tree_Sitter_Code_Index_Nest_Node* child_first;
Tree_Sitter_Code_Index_Nest_Node* child_last;
Code_Index_Nest* nest;
};
struct Tree_Sitter_Code_Index_Nest_List
{
Tree_Sitter_Code_Index_Nest_Node* first;
Tree_Sitter_Code_Index_Nest_Node* last;
};
b8 use_tree_sitter_code_indexing = true;
b8 use_tree_sitter_token_coloring = true;
function void tree_sitter_code_index_update_tick(Application_Links *app);
#endif //FCODER_TREE_SITTER_H

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,119 @@
#include <tree_sitter/parser.h>
#include <string>
#include <cwctype>
namespace {
using std::wstring;
using std::iswspace;
enum TokenType {
RAW_STRING_LITERAL,
};
struct Scanner {
bool scan(TSLexer *lexer, const bool *valid_symbols) {
while (iswspace(lexer->lookahead)) {
lexer->advance(lexer, true);
}
lexer->result_symbol = RAW_STRING_LITERAL;
// Raw string literals can start with: R, LR, uR, UR, u8R
// Consume 'R'
if (lexer->lookahead == 'L' || lexer->lookahead == 'U') {
lexer->advance(lexer, false);
if (lexer->lookahead != 'R') {
return false;
}
} else if (lexer->lookahead == 'u') {
lexer->advance(lexer, false);
if (lexer->lookahead == '8') {
lexer->advance(lexer, false);
if (lexer->lookahead != 'R') {
return false;
}
} else if (lexer->lookahead != 'R') {
return false;
}
} else if (lexer->lookahead != 'R') {
return false;
}
lexer->advance(lexer, false);
// Consume '"'
if (lexer->lookahead != '"') return false;
lexer->advance(lexer, false);
// Consume '(', delimiter
wstring delimiter;
for (;;) {
if (lexer->lookahead == 0 || lexer->lookahead == '\\' || iswspace(lexer->lookahead)) {
return false;
}
if (lexer->lookahead == '(') {
lexer->advance(lexer, false);
break;
}
delimiter += lexer->lookahead;
lexer->advance(lexer, false);
}
// Consume content, delimiter, ')', '"'
int delimiter_index = -1;
for (;;) {
if (lexer->lookahead == 0) return false;
if (delimiter_index >= 0) {
if (static_cast<unsigned>(delimiter_index) == delimiter.size()) {
if (lexer->lookahead == '"') {
lexer->advance(lexer, false);
return true;
} else {
delimiter_index = -1;
}
} else {
if (lexer->lookahead == delimiter[delimiter_index]) {
delimiter_index++;
} else {
delimiter_index = -1;
}
}
}
if (delimiter_index == -1 && lexer->lookahead == ')') {
delimiter_index = 0;
}
lexer->advance(lexer, false);
}
}
};
}
extern "C" {
void *tree_sitter_cpp_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_cpp_external_scanner_serialize(void *payload, char *buffer) {
return 0;
}
void tree_sitter_cpp_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
}
void tree_sitter_cpp_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,224 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
typedef uint16_t TSStateId;
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
typedef struct {
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef struct {
bool visible;
bool named;
bool supertype;
} TSSymbolMetadata;
typedef struct TSLexer TSLexer;
struct TSLexer {
int32_t lookahead;
TSSymbol result_symbol;
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
};
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef union {
struct {
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
int16_t dynamic_precedence;
uint16_t production_id;
} reduce;
uint8_t type;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable;
} entry;
} TSParseActionEntry;
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
uint32_t state_count;
uint32_t large_state_count;
uint32_t production_id_count;
uint32_t field_count;
uint16_t max_alias_sequence_length;
const uint16_t *parse_table;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSParseActionEntry *parse_actions;
const char * const *symbol_names;
const char * const *field_names;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const TSSymbolMetadata *symbol_metadata;
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
const TSStateId *primary_state_ids;
};
/*
* Lexer Macros
*/
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
#define END_STATE() return result;
/*
* Parse Table Macros
*/
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
#define STATE(id) id
#define ACTIONS(id) id
#define SHIFT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value \
} \
}}
#define SHIFT_REPEAT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value, \
.repetition = true \
} \
}}
#define SHIFT_EXTRA() \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.extra = true \
} \
}}
#define REDUCE(symbol_val, child_count_val, ...) \
{{ \
.reduce = { \
.type = TSParseActionTypeReduce, \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
}, \
}}
#define RECOVER() \
{{ \
.type = TSParseActionTypeRecover \
}}
#define ACCEPT_INPUT() \
{{ \
.type = TSParseActionTypeAccept \
}}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_

View File

@ -0,0 +1,48 @@
#include "alloc.h"
#include "tree_sitter/api.h"
#include <stdlib.h>
static void *ts_malloc_default(size_t size) {
void *result = malloc(size);
if (size > 0 && !result) {
fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size);
abort();
}
return result;
}
static void *ts_calloc_default(size_t count, size_t size) {
void *result = calloc(count, size);
if (count > 0 && !result) {
fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size);
abort();
}
return result;
}
static void *ts_realloc_default(void *buffer, size_t size) {
void *result = realloc(buffer, size);
if (size > 0 && !result) {
fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size);
abort();
}
return result;
}
// Allow clients to override allocation functions dynamically
TS_PUBLIC void *(*ts_current_malloc)(size_t) = ts_malloc_default;
TS_PUBLIC void *(*ts_current_calloc)(size_t, size_t) = ts_calloc_default;
TS_PUBLIC void *(*ts_current_realloc)(void *, size_t) = ts_realloc_default;
TS_PUBLIC void (*ts_current_free)(void *) = free;
void ts_set_allocator(
void *(*new_malloc)(size_t size),
void *(*new_calloc)(size_t count, size_t size),
void *(*new_realloc)(void *ptr, size_t size),
void (*new_free)(void *ptr)
) {
ts_current_malloc = new_malloc ? new_malloc : ts_malloc_default;
ts_current_calloc = new_calloc ? new_calloc : ts_calloc_default;
ts_current_realloc = new_realloc ? new_realloc : ts_realloc_default;
ts_current_free = new_free ? new_free : free;
}

View File

@ -0,0 +1,41 @@
#ifndef TREE_SITTER_ALLOC_H_
#define TREE_SITTER_ALLOC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#if defined(TREE_SITTER_HIDDEN_SYMBOLS) || defined(_WIN32)
#define TS_PUBLIC
#else
#define TS_PUBLIC __attribute__((visibility("default")))
#endif
TS_PUBLIC extern void *(*ts_current_malloc)(size_t size);
TS_PUBLIC extern void *(*ts_current_calloc)(size_t count, size_t size);
TS_PUBLIC extern void *(*ts_current_realloc)(void *ptr, size_t size);
TS_PUBLIC extern void (*ts_current_free)(void *ptr);
// Allow clients to override allocation functions
#ifndef ts_malloc
#define ts_malloc ts_current_malloc
#endif
#ifndef ts_calloc
#define ts_calloc ts_current_calloc
#endif
#ifndef ts_realloc
#define ts_realloc ts_current_realloc
#endif
#ifndef ts_free
#define ts_free ts_current_free
#endif
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_ALLOC_H_

View File

@ -0,0 +1,291 @@
#ifndef TREE_SITTER_ARRAY_H_
#define TREE_SITTER_ARRAY_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./alloc.h"
#include "./ts_assert.h"
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4101)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#define Array(T) \
struct { \
T *contents; \
uint32_t size; \
uint32_t capacity; \
}
/// Initialize an array.
#define array_init(self) \
((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL)
/// Create an empty array.
#define array_new() \
{ NULL, 0, 0 }
/// Get a pointer to the element at a given `index` in the array.
#define array_get(self, _index) \
(ts_assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index])
/// Get a pointer to the first element in the array.
#define array_front(self) array_get(self, 0)
/// Get a pointer to the last element in the array.
#define array_back(self) array_get(self, (self)->size - 1)
/// Clear the array, setting its size to zero. Note that this does not free any
/// memory allocated for the array's contents.
#define array_clear(self) ((self)->size = 0)
/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is
/// less than the array's current capacity, this function has no effect.
#define array_reserve(self, new_capacity) \
_array__reserve((Array *)(self), array_elem_size(self), new_capacity)
/// Free any memory allocated for this array. Note that this does not free any
/// memory allocated for the array's contents.
#define array_delete(self) _array__delete((Array *)(self))
/// Push a new `element` onto the end of the array.
#define array_push(self, element) \
(_array__grow((Array *)(self), 1, array_elem_size(self)), \
(self)->contents[(self)->size++] = (element))
/// Increase the array's size by `count` elements.
/// New elements are zero-initialized.
#define array_grow_by(self, count) \
do { \
if ((count) == 0) break; \
_array__grow((Array *)(self), count, array_elem_size(self)); \
memset((self)->contents + (self)->size, 0, (count) * array_elem_size(self)); \
(self)->size += (count); \
} while (0)
/// Append all elements from one array to the end of another.
#define array_push_all(self, other) \
array_extend((self), (other)->size, (other)->contents)
/// Append `count` elements to the end of the array, reading their values from the
/// `contents` pointer.
#define array_extend(self, count, contents) \
_array__splice( \
(Array *)(self), array_elem_size(self), (self)->size, \
0, count, contents \
)
/// Remove `old_count` elements from the array starting at the given `index`. At
/// the same index, insert `new_count` new elements, reading their values from the
/// `new_contents` pointer.
#define array_splice(self, _index, old_count, new_count, new_contents) \
_array__splice( \
(Array *)(self), array_elem_size(self), _index, \
old_count, new_count, new_contents \
)
/// Insert one `element` into the array at the given `index`.
#define array_insert(self, _index, element) \
_array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element))
/// Remove one element from the array at the given `index`.
#define array_erase(self, _index) \
_array__erase((Array *)(self), array_elem_size(self), _index)
/// Pop the last element off the array, returning the element by value.
#define array_pop(self) ((self)->contents[--(self)->size])
/// Assign the contents of one array to another, reallocating if necessary.
#define array_assign(self, other) \
_array__assign((Array *)(self), (const Array *)(other), array_elem_size(self))
/// Swap one array with another
#define array_swap(self, other) \
_array__swap((Array *)(self), (Array *)(other))
/// Get the size of the array contents
#define array_elem_size(self) (sizeof *(self)->contents)
/// Search a sorted array for a given `needle` value, using the given `compare`
/// callback to determine the order.
///
/// If an existing element is found to be equal to `needle`, then the `index`
/// out-parameter is set to the existing value's index, and the `exists`
/// out-parameter is set to true. Otherwise, `index` is set to an index where
/// `needle` should be inserted in order to preserve the sorting, and `exists`
/// is set to false.
#define array_search_sorted_with(self, compare, needle, _index, _exists) \
_array__search_sorted(self, 0, compare, , needle, _index, _exists)
/// Search a sorted array for a given `needle` value, using integer comparisons
/// of a given struct field (specified with a leading dot) to determine the order.
///
/// See also `array_search_sorted_with`.
#define array_search_sorted_by(self, field, needle, _index, _exists) \
_array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists)
/// Insert a given `value` into a sorted array, using the given `compare`
/// callback to determine the order.
#define array_insert_sorted_with(self, compare, value) \
do { \
unsigned _index, _exists; \
array_search_sorted_with(self, compare, &(value), &_index, &_exists); \
if (!_exists) array_insert(self, _index, value); \
} while (0)
/// Insert a given `value` into a sorted array, using integer comparisons of
/// a given struct field (specified with a leading dot) to determine the order.
///
/// See also `array_search_sorted_by`.
#define array_insert_sorted_by(self, field, value) \
do { \
unsigned _index, _exists; \
array_search_sorted_by(self, field, (value) field, &_index, &_exists); \
if (!_exists) array_insert(self, _index, value); \
} while (0)
// Private
typedef Array(void) Array;
/// This is not what you're looking for, see `array_delete`.
static inline void _array__delete(Array *self) {
if (self->contents) {
ts_free(self->contents);
self->contents = NULL;
self->size = 0;
self->capacity = 0;
}
}
/// This is not what you're looking for, see `array_erase`.
static inline void _array__erase(Array *self, size_t element_size,
uint32_t index) {
ts_assert(index < self->size);
char *contents = (char *)self->contents;
memmove(contents + index * element_size, contents + (index + 1) * element_size,
(self->size - index - 1) * element_size);
self->size--;
}
/// This is not what you're looking for, see `array_reserve`.
static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) {
if (new_capacity > self->capacity) {
if (self->contents) {
self->contents = ts_realloc(self->contents, new_capacity * element_size);
} else {
self->contents = ts_malloc(new_capacity * element_size);
}
self->capacity = new_capacity;
}
}
/// This is not what you're looking for, see `array_assign`.
static inline void _array__assign(Array *self, const Array *other, size_t element_size) {
_array__reserve(self, element_size, other->size);
self->size = other->size;
memcpy(self->contents, other->contents, self->size * element_size);
}
/// This is not what you're looking for, see `array_swap`.
static inline void _array__swap(Array *self, Array *other) {
Array swap = *other;
*other = *self;
*self = swap;
}
/// This is not what you're looking for, see `array_push` or `array_grow_by`.
static inline void _array__grow(Array *self, uint32_t count, size_t element_size) {
uint32_t new_size = self->size + count;
if (new_size > self->capacity) {
uint32_t new_capacity = self->capacity * 2;
if (new_capacity < 8) new_capacity = 8;
if (new_capacity < new_size) new_capacity = new_size;
_array__reserve(self, element_size, new_capacity);
}
}
/// This is not what you're looking for, see `array_splice`.
static inline void _array__splice(Array *self, size_t element_size,
uint32_t index, uint32_t old_count,
uint32_t new_count, const void *elements) {
uint32_t new_size = self->size + new_count - old_count;
uint32_t old_end = index + old_count;
uint32_t new_end = index + new_count;
ts_assert(old_end <= self->size);
_array__reserve(self, element_size, new_size);
char *contents = (char *)self->contents;
if (self->size > old_end) {
memmove(
contents + new_end * element_size,
contents + old_end * element_size,
(self->size - old_end) * element_size
);
}
if (new_count > 0) {
if (elements) {
memcpy(
(contents + index * element_size),
elements,
new_count * element_size
);
} else {
memset(
(contents + index * element_size),
0,
new_count * element_size
);
}
}
self->size += new_count - old_count;
}
/// A binary search routine, based on Rust's `std::slice::binary_search_by`.
/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`.
#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \
do { \
*(_index) = start; \
*(_exists) = false; \
uint32_t size = (self)->size - *(_index); \
if (size == 0) break; \
int comparison; \
while (size > 1) { \
uint32_t half_size = size / 2; \
uint32_t mid_index = *(_index) + half_size; \
comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \
if (comparison <= 0) *(_index) = mid_index; \
size -= half_size; \
} \
comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \
if (comparison == 0) *(_exists) = true; \
else if (comparison < 0) *(_index) += 1; \
} while (0)
/// Helper macro for the `_sorted_by` routines below. This takes the left (existing)
/// parameter by reference in order to work with the generic sorting function above.
#define _compare_int(a, b) ((int)*(a) - (int)(b))
#ifdef _MSC_VER
#pragma warning(pop)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_ARRAY_H_

View File

@ -0,0 +1,68 @@
#ifndef TREE_SITTER_ATOMIC_H_
#define TREE_SITTER_ATOMIC_H_
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#ifdef __TINYC__
static inline size_t atomic_load(const volatile size_t *p) {
return *p;
}
static inline uint32_t atomic_inc(volatile uint32_t *p) {
*p += 1;
return *p;
}
static inline uint32_t atomic_dec(volatile uint32_t *p) {
*p-= 1;
return *p;
}
#elif defined(_WIN32)
#include <windows.h>
static inline size_t atomic_load(const volatile size_t *p) {
return *p;
}
static inline uint32_t atomic_inc(volatile uint32_t *p) {
return InterlockedIncrement((long volatile *)p);
}
static inline uint32_t atomic_dec(volatile uint32_t *p) {
return InterlockedDecrement((long volatile *)p);
}
#else
static inline size_t atomic_load(const volatile size_t *p) {
#ifdef __ATOMIC_RELAXED
return __atomic_load_n(p, __ATOMIC_RELAXED);
#else
return __sync_fetch_and_add((volatile size_t *)p, 0);
#endif
}
static inline uint32_t atomic_inc(volatile uint32_t *p) {
#ifdef __ATOMIC_RELAXED
return __atomic_add_fetch(p, 1U, __ATOMIC_SEQ_CST);
#else
return __sync_add_and_fetch(p, 1U);
#endif
}
static inline uint32_t atomic_dec(volatile uint32_t *p) {
#ifdef __ATOMIC_RELAXED
return __atomic_sub_fetch(p, 1U, __ATOMIC_SEQ_CST);
#else
return __sync_sub_and_fetch(p, 1U);
#endif
}
#endif
#endif // TREE_SITTER_ATOMIC_H_

View File

@ -0,0 +1,146 @@
#ifndef TREE_SITTER_CLOCK_H_
#define TREE_SITTER_CLOCK_H_
#include <stdbool.h>
#include <stdint.h>
typedef uint64_t TSDuration;
#ifdef _WIN32
// Windows:
// * Represent a time as a performance counter value.
// * Represent a duration as a number of performance counter ticks.
#include <windows.h>
typedef uint64_t TSClock;
static inline TSDuration duration_from_micros(uint64_t micros) {
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
return micros * (uint64_t)frequency.QuadPart / 1000000;
}
static inline uint64_t duration_to_micros(TSDuration self) {
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
return self * 1000000 / (uint64_t)frequency.QuadPart;
}
static inline TSClock clock_null(void) {
return 0;
}
static inline TSClock clock_now(void) {
LARGE_INTEGER result;
QueryPerformanceCounter(&result);
return (uint64_t)result.QuadPart;
}
static inline TSClock clock_after(TSClock base, TSDuration duration) {
return base + duration;
}
static inline bool clock_is_null(TSClock self) {
return !self;
}
static inline bool clock_is_gt(TSClock self, TSClock other) {
return self > other;
}
#elif defined(CLOCK_MONOTONIC)
// POSIX with monotonic clock support (Linux, macOS)
// * Represent a time as a monotonic (seconds, nanoseconds) pair.
// * Represent a duration as a number of microseconds.
//
// On these platforms, parse timeouts will correspond accurately to
// real time, regardless of what other processes are running.
#include <time.h>
typedef struct timespec TSClock;
static inline TSDuration duration_from_micros(uint64_t micros) {
return micros;
}
static inline uint64_t duration_to_micros(TSDuration self) {
return self;
}
static inline TSClock clock_now(void) {
TSClock result;
clock_gettime(CLOCK_MONOTONIC, &result);
return result;
}
static inline TSClock clock_null(void) {
return (TSClock) {0, 0};
}
static inline TSClock clock_after(TSClock base, TSDuration duration) {
TSClock result = base;
result.tv_sec += duration / 1000000;
result.tv_nsec += (duration % 1000000) * 1000;
if (result.tv_nsec >= 1000000000) {
result.tv_nsec -= 1000000000;
++(result.tv_sec);
}
return result;
}
static inline bool clock_is_null(TSClock self) {
return !self.tv_sec && !self.tv_nsec;
}
static inline bool clock_is_gt(TSClock self, TSClock other) {
if (self.tv_sec > other.tv_sec) return true;
if (self.tv_sec < other.tv_sec) return false;
return self.tv_nsec > other.tv_nsec;
}
#else
// POSIX without monotonic clock support
// * Represent a time as a process clock value.
// * Represent a duration as a number of process clock ticks.
//
// On these platforms, parse timeouts may be affected by other processes,
// which is not ideal, but is better than using a non-monotonic time API
// like `gettimeofday`.
#include <time.h>
typedef uint64_t TSClock;
static inline TSDuration duration_from_micros(uint64_t micros) {
return micros * (uint64_t)CLOCKS_PER_SEC / 1000000;
}
static inline uint64_t duration_to_micros(TSDuration self) {
return self * 1000000 / (uint64_t)CLOCKS_PER_SEC;
}
static inline TSClock clock_null(void) {
return 0;
}
static inline TSClock clock_now(void) {
return (uint64_t)clock();
}
static inline TSClock clock_after(TSClock base, TSDuration duration) {
return base + duration;
}
static inline bool clock_is_null(TSClock self) {
return !self;
}
static inline bool clock_is_gt(TSClock self, TSClock other) {
return self > other;
}
#endif
#endif // TREE_SITTER_CLOCK_H_

View File

@ -0,0 +1,11 @@
#ifndef TREE_SITTER_ERROR_COSTS_H_
#define TREE_SITTER_ERROR_COSTS_H_
#define ERROR_STATE 0
#define ERROR_COST_PER_RECOVERY 500
#define ERROR_COST_PER_MISSING_TREE 110
#define ERROR_COST_PER_SKIPPED_TREE 100
#define ERROR_COST_PER_SKIPPED_LINE 30
#define ERROR_COST_PER_SKIPPED_CHAR 1
#endif

View File

@ -0,0 +1,523 @@
#include "./get_changed_ranges.h"
#include "./subtree.h"
#include "./language.h"
#include "./error_costs.h"
#include "./tree_cursor.h"
#include "./ts_assert.h"
// #define DEBUG_GET_CHANGED_RANGES
static void ts_range_array_add(
TSRangeArray *self,
Length start,
Length end
) {
if (self->size > 0) {
TSRange *last_range = array_back(self);
if (start.bytes <= last_range->end_byte) {
last_range->end_byte = end.bytes;
last_range->end_point = end.extent;
return;
}
}
if (start.bytes < end.bytes) {
TSRange range = { start.extent, end.extent, start.bytes, end.bytes };
array_push(self, range);
}
}
bool ts_range_array_intersects(
const TSRangeArray *self,
unsigned start_index,
uint32_t start_byte,
uint32_t end_byte
) {
for (unsigned i = start_index; i < self->size; i++) {
TSRange *range = array_get(self, i);
if (range->end_byte > start_byte) {
if (range->start_byte >= end_byte) break;
return true;
}
}
return false;
}
void ts_range_array_get_changed_ranges(
const TSRange *old_ranges, unsigned old_range_count,
const TSRange *new_ranges, unsigned new_range_count,
TSRangeArray *differences
) {
unsigned new_index = 0;
unsigned old_index = 0;
Length current_position = length_zero();
bool in_old_range = false;
bool in_new_range = false;
while (old_index < old_range_count || new_index < new_range_count) {
const TSRange *old_range = &old_ranges[old_index];
const TSRange *new_range = &new_ranges[new_index];
Length next_old_position;
if (in_old_range) {
next_old_position = (Length) {old_range->end_byte, old_range->end_point};
} else if (old_index < old_range_count) {
next_old_position = (Length) {old_range->start_byte, old_range->start_point};
} else {
next_old_position = LENGTH_MAX;
}
Length next_new_position;
if (in_new_range) {
next_new_position = (Length) {new_range->end_byte, new_range->end_point};
} else if (new_index < new_range_count) {
next_new_position = (Length) {new_range->start_byte, new_range->start_point};
} else {
next_new_position = LENGTH_MAX;
}
if (next_old_position.bytes < next_new_position.bytes) {
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_old_position);
}
if (in_old_range) old_index++;
current_position = next_old_position;
in_old_range = !in_old_range;
} else if (next_new_position.bytes < next_old_position.bytes) {
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_new_position);
}
if (in_new_range) new_index++;
current_position = next_new_position;
in_new_range = !in_new_range;
} else {
if (in_old_range != in_new_range) {
ts_range_array_add(differences, current_position, next_new_position);
}
if (in_old_range) old_index++;
if (in_new_range) new_index++;
in_old_range = !in_old_range;
in_new_range = !in_new_range;
current_position = next_new_position;
}
}
}
typedef struct {
TreeCursor cursor;
const TSLanguage *language;
unsigned visible_depth;
bool in_padding;
Subtree prev_external_token;
} Iterator;
static Iterator iterator_new(
TreeCursor *cursor,
const Subtree *tree,
const TSLanguage *language
) {
array_clear(&cursor->stack);
array_push(&cursor->stack, ((TreeCursorEntry) {
.subtree = tree,
.position = length_zero(),
.child_index = 0,
.structural_child_index = 0,
}));
return (Iterator) {
.cursor = *cursor,
.language = language,
.visible_depth = 1,
.in_padding = false,
.prev_external_token = NULL_SUBTREE,
};
}
static bool iterator_done(Iterator *self) {
return self->cursor.stack.size == 0;
}
static Length iterator_start_position(Iterator *self) {
TreeCursorEntry entry = *array_back(&self->cursor.stack);
if (self->in_padding) {
return entry.position;
} else {
return length_add(entry.position, ts_subtree_padding(*entry.subtree));
}
}
static Length iterator_end_position(Iterator *self) {
TreeCursorEntry entry = *array_back(&self->cursor.stack);
Length result = length_add(entry.position, ts_subtree_padding(*entry.subtree));
if (self->in_padding) {
return result;
} else {
return length_add(result, ts_subtree_size(*entry.subtree));
}
}
static bool iterator_tree_is_visible(const Iterator *self) {
TreeCursorEntry entry = *array_back(&self->cursor.stack);
if (ts_subtree_visible(*entry.subtree)) return true;
if (self->cursor.stack.size > 1) {
Subtree parent = *array_get(&self->cursor.stack, self->cursor.stack.size - 2)->subtree;
return ts_language_alias_at(
self->language,
parent.ptr->production_id,
entry.structural_child_index
) != 0;
}
return false;
}
static void iterator_get_visible_state(
const Iterator *self,
Subtree *tree,
TSSymbol *alias_symbol,
uint32_t *start_byte
) {
uint32_t i = self->cursor.stack.size - 1;
if (self->in_padding) {
if (i == 0) return;
i--;
}
for (; i + 1 > 0; i--) {
TreeCursorEntry entry = *array_get(&self->cursor.stack, i);
if (i > 0) {
const Subtree *parent = array_get(&self->cursor.stack, i - 1)->subtree;
*alias_symbol = ts_language_alias_at(
self->language,
parent->ptr->production_id,
entry.structural_child_index
);
}
if (ts_subtree_visible(*entry.subtree) || *alias_symbol) {
*tree = *entry.subtree;
*start_byte = entry.position.bytes;
break;
}
}
}
static void iterator_ascend(Iterator *self) {
if (iterator_done(self)) return;
if (iterator_tree_is_visible(self) && !self->in_padding) self->visible_depth--;
if (array_back(&self->cursor.stack)->child_index > 0) self->in_padding = false;
self->cursor.stack.size--;
}
static bool iterator_descend(Iterator *self, uint32_t goal_position) {
if (self->in_padding) return false;
bool did_descend = false;
do {
did_descend = false;
TreeCursorEntry entry = *array_back(&self->cursor.stack);
Length position = entry.position;
uint32_t structural_child_index = 0;
for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) {
const Subtree *child = &ts_subtree_children(*entry.subtree)[i];
Length child_left = length_add(position, ts_subtree_padding(*child));
Length child_right = length_add(child_left, ts_subtree_size(*child));
if (child_right.bytes > goal_position) {
array_push(&self->cursor.stack, ((TreeCursorEntry) {
.subtree = child,
.position = position,
.child_index = i,
.structural_child_index = structural_child_index,
}));
if (iterator_tree_is_visible(self)) {
if (child_left.bytes > goal_position) {
self->in_padding = true;
} else {
self->visible_depth++;
}
return true;
}
did_descend = true;
break;
}
position = child_right;
if (!ts_subtree_extra(*child)) structural_child_index++;
Subtree last_external_token = ts_subtree_last_external_token(*child);
if (last_external_token.ptr) {
self->prev_external_token = last_external_token;
}
}
} while (did_descend);
return false;
}
static void iterator_advance(Iterator *self) {
if (self->in_padding) {
self->in_padding = false;
if (iterator_tree_is_visible(self)) {
self->visible_depth++;
} else {
iterator_descend(self, 0);
}
return;
}
for (;;) {
if (iterator_tree_is_visible(self)) self->visible_depth--;
TreeCursorEntry entry = array_pop(&self->cursor.stack);
if (iterator_done(self)) return;
const Subtree *parent = array_back(&self->cursor.stack)->subtree;
uint32_t child_index = entry.child_index + 1;
Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree);
if (last_external_token.ptr) {
self->prev_external_token = last_external_token;
}
if (ts_subtree_child_count(*parent) > child_index) {
Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree));
uint32_t structural_child_index = entry.structural_child_index;
if (!ts_subtree_extra(*entry.subtree)) structural_child_index++;
const Subtree *next_child = &ts_subtree_children(*parent)[child_index];
array_push(&self->cursor.stack, ((TreeCursorEntry) {
.subtree = next_child,
.position = position,
.child_index = child_index,
.structural_child_index = structural_child_index,
}));
if (iterator_tree_is_visible(self)) {
if (ts_subtree_padding(*next_child).bytes > 0) {
self->in_padding = true;
} else {
self->visible_depth++;
}
} else {
iterator_descend(self, 0);
}
break;
}
}
}
typedef enum {
IteratorDiffers,
IteratorMayDiffer,
IteratorMatches,
} IteratorComparison;
static IteratorComparison iterator_compare(
const Iterator *old_iter,
const Iterator *new_iter
) {
Subtree old_tree = NULL_SUBTREE;
Subtree new_tree = NULL_SUBTREE;
uint32_t old_start = 0;
uint32_t new_start = 0;
TSSymbol old_alias_symbol = 0;
TSSymbol new_alias_symbol = 0;
iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start);
iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start);
TSSymbol old_symbol = ts_subtree_symbol(old_tree);
TSSymbol new_symbol = ts_subtree_symbol(new_tree);
if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches;
if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers;
if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers;
uint32_t old_size = ts_subtree_size(old_tree).bytes;
uint32_t new_size = ts_subtree_size(new_tree).bytes;
TSStateId old_state = ts_subtree_parse_state(old_tree);
TSStateId new_state = ts_subtree_parse_state(new_tree);
bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree);
bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree);
uint32_t old_error_cost = ts_subtree_error_cost(old_tree);
uint32_t new_error_cost = ts_subtree_error_cost(new_tree);
if (
old_start != new_start ||
old_symbol == ts_builtin_sym_error ||
old_size != new_size ||
old_state == TS_TREE_STATE_NONE ||
new_state == TS_TREE_STATE_NONE ||
((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) ||
old_error_cost != new_error_cost ||
old_has_external_tokens != new_has_external_tokens ||
ts_subtree_has_changes(old_tree) ||
(
old_has_external_tokens &&
!ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token)
)
) {
return IteratorMayDiffer;
}
return IteratorMatches;
}
#ifdef DEBUG_GET_CHANGED_RANGES
static inline void iterator_print_state(Iterator *self) {
TreeCursorEntry entry = *array_back(&self->cursor.stack);
TSPoint start = iterator_start_position(self).extent;
TSPoint end = iterator_end_position(self).extent;
const char *name = ts_language_symbol_name(self->language, ts_subtree_symbol(*entry.subtree));
printf(
"(%-25s %s\t depth:%u [%u, %u] - [%u, %u])",
name, self->in_padding ? "(p)" : " ",
self->visible_depth,
start.row, start.column,
end.row, end.column
);
}
#endif
unsigned ts_subtree_get_changed_ranges(
const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges
) {
TSRangeArray results = array_new();
Iterator old_iter = iterator_new(cursor1, old_tree, language);
Iterator new_iter = iterator_new(cursor2, new_tree, language);
unsigned included_range_difference_index = 0;
Length position = iterator_start_position(&old_iter);
Length next_position = iterator_start_position(&new_iter);
if (position.bytes < next_position.bytes) {
ts_range_array_add(&results, position, next_position);
position = next_position;
} else if (position.bytes > next_position.bytes) {
ts_range_array_add(&results, next_position, position);
next_position = position;
}
do {
#ifdef DEBUG_GET_CHANGED_RANGES
printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column);
iterator_print_state(&old_iter);
printf("\tvs\t");
iterator_print_state(&new_iter);
puts("");
#endif
// Compare the old and new subtrees.
IteratorComparison comparison = iterator_compare(&old_iter, &new_iter);
// Even if the two subtrees appear to be identical, they could differ
// internally if they contain a range of text that was previously
// excluded from the parse, and is now included, or vice-versa.
if (comparison == IteratorMatches && ts_range_array_intersects(
included_range_differences,
included_range_difference_index,
position.bytes,
iterator_end_position(&old_iter).bytes
)) {
comparison = IteratorMayDiffer;
}
bool is_changed = false;
switch (comparison) {
// If the subtrees are definitely identical, move to the end
// of both subtrees.
case IteratorMatches:
next_position = iterator_end_position(&old_iter);
break;
// If the subtrees might differ internally, descend into both
// subtrees, finding the first child that spans the current position.
case IteratorMayDiffer:
if (iterator_descend(&old_iter, position.bytes)) {
if (!iterator_descend(&new_iter, position.bytes)) {
is_changed = true;
next_position = iterator_end_position(&old_iter);
}
} else if (iterator_descend(&new_iter, position.bytes)) {
is_changed = true;
next_position = iterator_end_position(&new_iter);
} else {
next_position = length_min(
iterator_end_position(&old_iter),
iterator_end_position(&new_iter)
);
}
break;
// If the subtrees are different, record a change and then move
// to the end of both subtrees.
case IteratorDiffers:
is_changed = true;
next_position = length_min(
iterator_end_position(&old_iter),
iterator_end_position(&new_iter)
);
break;
}
// Ensure that both iterators are caught up to the current position.
while (
!iterator_done(&old_iter) &&
iterator_end_position(&old_iter).bytes <= next_position.bytes
) iterator_advance(&old_iter);
while (
!iterator_done(&new_iter) &&
iterator_end_position(&new_iter).bytes <= next_position.bytes
) iterator_advance(&new_iter);
// Ensure that both iterators are at the same depth in the tree.
while (old_iter.visible_depth > new_iter.visible_depth) {
iterator_ascend(&old_iter);
}
while (new_iter.visible_depth > old_iter.visible_depth) {
iterator_ascend(&new_iter);
}
if (is_changed) {
#ifdef DEBUG_GET_CHANGED_RANGES
printf(
" change: [[%u, %u] - [%u, %u]]\n",
position.extent.row + 1, position.extent.column,
next_position.extent.row + 1, next_position.extent.column
);
#endif
ts_range_array_add(&results, position, next_position);
}
position = next_position;
// Keep track of the current position in the included range differences
// array in order to avoid scanning the entire array on each iteration.
while (included_range_difference_index < included_range_differences->size) {
const TSRange *range = array_get(included_range_differences,
included_range_difference_index
);
if (range->end_byte <= position.bytes) {
included_range_difference_index++;
} else {
break;
}
}
} while (!iterator_done(&old_iter) && !iterator_done(&new_iter));
Length old_size = ts_subtree_total_size(*old_tree);
Length new_size = ts_subtree_total_size(*new_tree);
if (old_size.bytes < new_size.bytes) {
ts_range_array_add(&results, old_size, new_size);
} else if (new_size.bytes < old_size.bytes) {
ts_range_array_add(&results, new_size, old_size);
}
*cursor1 = old_iter.cursor;
*cursor2 = new_iter.cursor;
*ranges = results.contents;
return results.size;
}

View File

@ -0,0 +1,36 @@
#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_
#define TREE_SITTER_GET_CHANGED_RANGES_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./tree_cursor.h"
#include "./subtree.h"
typedef Array(TSRange) TSRangeArray;
void ts_range_array_get_changed_ranges(
const TSRange *old_ranges, unsigned old_range_count,
const TSRange *new_ranges, unsigned new_range_count,
TSRangeArray *differences
);
bool ts_range_array_intersects(
const TSRangeArray *self, unsigned start_index,
uint32_t start_byte, uint32_t end_byte
);
unsigned ts_subtree_get_changed_ranges(
const Subtree *old_tree, const Subtree *new_tree,
TreeCursor *cursor1, TreeCursor *cursor2,
const TSLanguage *language,
const TSRangeArray *included_range_differences,
TSRange **ranges
);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_GET_CHANGED_RANGES_H_

View File

@ -0,0 +1,21 @@
// Determine endian and pointer size based on known defines.
// TS_BIG_ENDIAN and TS_PTR_SIZE can be set as -D compiler arguments
// to override this.
#if !defined(TS_BIG_ENDIAN)
#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \
|| (defined( __APPLE_CC__) && (defined(__ppc__) || defined(__ppc64__)))
#define TS_BIG_ENDIAN 1
#else
#define TS_BIG_ENDIAN 0
#endif
#endif
#if !defined(TS_PTR_SIZE)
#if UINTPTR_MAX == 0xFFFFFFFF
#define TS_PTR_SIZE 32
#else
#define TS_PTR_SIZE 64
#endif
#endif

View File

@ -0,0 +1,293 @@
#include "./language.h"
#include "./wasm_store.h"
#include "tree_sitter/api.h"
#include <string.h>
const TSLanguage *ts_language_copy(const TSLanguage *self) {
if (self && ts_language_is_wasm(self)) {
ts_wasm_language_retain(self);
}
return self;
}
void ts_language_delete(const TSLanguage *self) {
if (self && ts_language_is_wasm(self)) {
ts_wasm_language_release(self);
}
}
uint32_t ts_language_symbol_count(const TSLanguage *self) {
return self->symbol_count + self->alias_count;
}
uint32_t ts_language_state_count(const TSLanguage *self) {
return self->state_count;
}
const TSSymbol *ts_language_supertypes(const TSLanguage *self, uint32_t *length) {
if (self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) {
*length = self->supertype_count;
return self->supertype_symbols;
} else {
*length = 0;
return NULL;
}
}
const TSSymbol *ts_language_subtypes(
const TSLanguage *self,
TSSymbol supertype,
uint32_t *length
) {
if (self->abi_version < LANGUAGE_VERSION_WITH_RESERVED_WORDS || !ts_language_symbol_metadata(self, supertype).supertype) {
*length = 0;
return NULL;
}
TSMapSlice slice = self->supertype_map_slices[supertype];
*length = slice.length;
return &self->supertype_map_entries[slice.index];
}
uint32_t ts_language_version(const TSLanguage *self) {
return self->abi_version;
}
uint32_t ts_language_abi_version(const TSLanguage *self) {
return self->abi_version;
}
const TSLanguageMetadata *ts_language_metadata(const TSLanguage *self) {
return self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? &self->metadata : NULL;
}
const char *ts_language_name(const TSLanguage *self) {
return self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? self->name : NULL;
}
uint32_t ts_language_field_count(const TSLanguage *self) {
return self->field_count;
}
void ts_language_table_entry(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol,
TableEntry *result
) {
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
result->action_count = 0;
result->is_reusable = false;
result->actions = NULL;
} else {
ts_assert(symbol < self->token_count);
uint32_t action_index = ts_language_lookup(self, state, symbol);
const TSParseActionEntry *entry = &self->parse_actions[action_index];
result->action_count = entry->entry.count;
result->is_reusable = entry->entry.reusable;
result->actions = (const TSParseAction *)(entry + 1);
}
}
TSLexerMode ts_language_lex_mode_for_state(
const TSLanguage *self,
TSStateId state
) {
if (self->abi_version < 15) {
TSLexMode mode = ((const TSLexMode *)self->lex_modes)[state];
return (TSLexerMode) {
.lex_state = mode.lex_state,
.external_lex_state = mode.external_lex_state,
.reserved_word_set_id = 0,
};
} else {
return self->lex_modes[state];
}
}
bool ts_language_is_reserved_word(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
TSLexerMode lex_mode = ts_language_lex_mode_for_state(self, state);
if (lex_mode.reserved_word_set_id > 0) {
unsigned start = lex_mode.reserved_word_set_id * self->max_reserved_word_set_size;
unsigned end = start + self->max_reserved_word_set_size;
for (unsigned i = start; i < end; i++) {
if (self->reserved_words[i] == symbol) return true;
if (self->reserved_words[i] == 0) break;
}
}
return false;
}
TSSymbolMetadata ts_language_symbol_metadata(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) {
return (TSSymbolMetadata) {.visible = true, .named = true};
} else if (symbol == ts_builtin_sym_error_repeat) {
return (TSSymbolMetadata) {.visible = false, .named = false};
} else {
return self->symbol_metadata[symbol];
}
}
TSSymbol ts_language_public_symbol(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) return symbol;
return self->public_symbol_map[symbol];
}
TSStateId ts_language_next_state(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
return 0;
} else if (symbol < self->token_count) {
uint32_t count;
const TSParseAction *actions = ts_language_actions(self, state, symbol, &count);
if (count > 0) {
TSParseAction action = actions[count - 1];
if (action.type == TSParseActionTypeShift) {
return action.shift.extra ? state : action.shift.state;
}
}
return 0;
} else {
return ts_language_lookup(self, state, symbol);
}
}
const char *ts_language_symbol_name(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) {
return "ERROR";
} else if (symbol == ts_builtin_sym_error_repeat) {
return "_ERROR";
} else if (symbol < ts_language_symbol_count(self)) {
return self->symbol_names[symbol];
} else {
return NULL;
}
}
TSSymbol ts_language_symbol_for_name(
const TSLanguage *self,
const char *string,
uint32_t length,
bool is_named
) {
if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error;
uint16_t count = (uint16_t)ts_language_symbol_count(self);
for (TSSymbol i = 0; i < count; i++) {
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i);
if ((!metadata.visible && !metadata.supertype) || metadata.named != is_named) continue;
const char *symbol_name = self->symbol_names[i];
if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
return self->public_symbol_map[i];
}
}
return 0;
}
TSSymbolType ts_language_symbol_type(
const TSLanguage *self,
TSSymbol symbol
) {
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol);
if (metadata.named && metadata.visible) {
return TSSymbolTypeRegular;
} else if (metadata.visible) {
return TSSymbolTypeAnonymous;
} else if (metadata.supertype) {
return TSSymbolTypeSupertype;
} else {
return TSSymbolTypeAuxiliary;
}
}
const char *ts_language_field_name_for_id(
const TSLanguage *self,
TSFieldId id
) {
uint32_t count = ts_language_field_count(self);
if (count && id <= count) {
return self->field_names[id];
} else {
return NULL;
}
}
TSFieldId ts_language_field_id_for_name(
const TSLanguage *self,
const char *name,
uint32_t name_length
) {
uint16_t count = (uint16_t)ts_language_field_count(self);
for (TSSymbol i = 1; i < count + 1; i++) {
switch (strncmp(name, self->field_names[i], name_length)) {
case 0:
if (self->field_names[i][name_length] == 0) return i;
break;
case -1:
return 0;
default:
break;
}
}
return 0;
}
TSLookaheadIterator *ts_lookahead_iterator_new(const TSLanguage *self, TSStateId state) {
if (state >= self->state_count) return NULL;
LookaheadIterator *iterator = ts_malloc(sizeof(LookaheadIterator));
*iterator = ts_language_lookaheads(self, state);
return (TSLookaheadIterator *)iterator;
}
void ts_lookahead_iterator_delete(TSLookaheadIterator *self) {
ts_free(self);
}
bool ts_lookahead_iterator_reset_state(TSLookaheadIterator * self, TSStateId state) {
LookaheadIterator *iterator = (LookaheadIterator *)self;
if (state >= iterator->language->state_count) return false;
*iterator = ts_language_lookaheads(iterator->language, state);
return true;
}
const TSLanguage *ts_lookahead_iterator_language(const TSLookaheadIterator *self) {
const LookaheadIterator *iterator = (const LookaheadIterator *)self;
return iterator->language;
}
bool ts_lookahead_iterator_reset(TSLookaheadIterator *self, const TSLanguage *language, TSStateId state) {
if (state >= language->state_count) return false;
LookaheadIterator *iterator = (LookaheadIterator *)self;
*iterator = ts_language_lookaheads(language, state);
return true;
}
bool ts_lookahead_iterator_next(TSLookaheadIterator *self) {
LookaheadIterator *iterator = (LookaheadIterator *)self;
return ts_lookahead_iterator__next(iterator);
}
TSSymbol ts_lookahead_iterator_current_symbol(const TSLookaheadIterator *self) {
const LookaheadIterator *iterator = (const LookaheadIterator *)self;
return iterator->symbol;
}
const char *ts_lookahead_iterator_current_symbol_name(const TSLookaheadIterator *self) {
const LookaheadIterator *iterator = (const LookaheadIterator *)self;
return ts_language_symbol_name(iterator->language, iterator->symbol);
}

View File

@ -0,0 +1,293 @@
#ifndef TREE_SITTER_LANGUAGE_H_
#define TREE_SITTER_LANGUAGE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./subtree.h"
#include "./parser.h"
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define LANGUAGE_VERSION_WITH_RESERVED_WORDS 15
#define LANGUAGE_VERSION_WITH_PRIMARY_STATES 14
typedef struct {
const TSParseAction *actions;
uint32_t action_count;
bool is_reusable;
} TableEntry;
typedef struct {
const TSLanguage *language;
const uint16_t *data;
const uint16_t *group_end;
TSStateId state;
uint16_t table_value;
uint16_t section_index;
uint16_t group_count;
bool is_small_state;
const TSParseAction *actions;
TSSymbol symbol;
TSStateId next_state;
uint16_t action_count;
} LookaheadIterator;
void ts_language_table_entry(const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result);
TSLexerMode ts_language_lex_mode_for_state(const TSLanguage *self, TSStateId state);
bool ts_language_is_reserved_word(const TSLanguage *self, TSStateId state, TSSymbol symbol);
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *self, TSSymbol symbol);
TSSymbol ts_language_public_symbol(const TSLanguage *self, TSSymbol symbol);
static inline const TSParseAction *ts_language_actions(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol,
uint32_t *count
) {
TableEntry entry;
ts_language_table_entry(self, state, symbol, &entry);
*count = entry.action_count;
return entry.actions;
}
static inline bool ts_language_has_reduce_action(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
TableEntry entry;
ts_language_table_entry(self, state, symbol, &entry);
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
}
// Lookup the table value for a given symbol and state.
//
// For non-terminal symbols, the table value represents a successor state.
// For terminal symbols, it represents an index in the actions table.
// For 'large' parse states, this is a direct lookup. For 'small' parse
// states, this requires searching through the symbol groups to find
// the given symbol.
static inline uint16_t ts_language_lookup(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
if (state >= self->large_state_count) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
const uint16_t *data = &self->small_parse_table[index];
uint16_t group_count = *(data++);
for (unsigned i = 0; i < group_count; i++) {
uint16_t section_value = *(data++);
uint16_t symbol_count = *(data++);
for (unsigned j = 0; j < symbol_count; j++) {
if (*(data++) == symbol) return section_value;
}
}
return 0;
} else {
return self->parse_table[state * self->symbol_count + symbol];
}
}
static inline bool ts_language_has_actions(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
return ts_language_lookup(self, state, symbol) != 0;
}
// Iterate over all of the symbols that are valid in the given state.
//
// For 'large' parse states, this just requires iterating through
// all possible symbols and checking the parse table for each one.
// For 'small' parse states, this exploits the structure of the
// table to only visit the valid symbols.
static inline LookaheadIterator ts_language_lookaheads(
const TSLanguage *self,
TSStateId state
) {
bool is_small_state = state >= self->large_state_count;
const uint16_t *data;
const uint16_t *group_end = NULL;
uint16_t group_count = 0;
if (is_small_state) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
data = &self->small_parse_table[index];
group_end = data + 1;
group_count = *data;
} else {
data = &self->parse_table[state * self->symbol_count] - 1;
}
return (LookaheadIterator) {
.language = self,
.data = data,
.group_end = group_end,
.group_count = group_count,
.is_small_state = is_small_state,
.symbol = UINT16_MAX,
.next_state = 0,
};
}
static inline bool ts_lookahead_iterator__next(LookaheadIterator *self) {
// For small parse states, valid symbols are listed explicitly,
// grouped by their value. There's no need to look up the actions
// again until moving to the next group.
if (self->is_small_state) {
self->data++;
if (self->data == self->group_end) {
if (self->group_count == 0) return false;
self->group_count--;
self->table_value = *(self->data++);
unsigned symbol_count = *(self->data++);
self->group_end = self->data + symbol_count;
self->symbol = *self->data;
} else {
self->symbol = *self->data;
return true;
}
}
// For large parse states, iterate through every symbol until one
// is found that has valid actions.
else {
do {
self->data++;
self->symbol++;
if (self->symbol >= self->language->symbol_count) return false;
self->table_value = *self->data;
} while (!self->table_value);
}
// Depending on if the symbols is terminal or non-terminal, the table value either
// represents a list of actions or a successor state.
if (self->symbol < self->language->token_count) {
const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value];
self->action_count = entry->entry.count;
self->actions = (const TSParseAction *)(entry + 1);
self->next_state = 0;
} else {
self->action_count = 0;
self->next_state = self->table_value;
}
return true;
}
// Whether the state is a "primary state". If this returns false, it indicates that there exists
// another state that behaves identically to this one with respect to query analysis.
static inline bool ts_language_state_is_primary(
const TSLanguage *self,
TSStateId state
) {
if (self->abi_version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) {
return state == self->primary_state_ids[state];
} else {
return true;
}
}
static inline const bool *ts_language_enabled_external_tokens(
const TSLanguage *self,
unsigned external_scanner_state
) {
if (external_scanner_state == 0) {
return NULL;
} else {
return self->external_scanner.states + self->external_token_count * external_scanner_state;
}
}
static inline const TSSymbol *ts_language_alias_sequence(
const TSLanguage *self,
uint32_t production_id
) {
return production_id ?
&self->alias_sequences[production_id * self->max_alias_sequence_length] :
NULL;
}
static inline TSSymbol ts_language_alias_at(
const TSLanguage *self,
uint32_t production_id,
uint32_t child_index
) {
return production_id ?
self->alias_sequences[production_id * self->max_alias_sequence_length + child_index] :
0;
}
static inline void ts_language_field_map(
const TSLanguage *self,
uint32_t production_id,
const TSFieldMapEntry **start,
const TSFieldMapEntry **end
) {
if (self->field_count == 0) {
*start = NULL;
*end = NULL;
return;
}
TSMapSlice slice = self->field_map_slices[production_id];
*start = &self->field_map_entries[slice.index];
*end = &self->field_map_entries[slice.index] + slice.length;
}
static inline void ts_language_aliases_for_symbol(
const TSLanguage *self,
TSSymbol original_symbol,
const TSSymbol **start,
const TSSymbol **end
) {
*start = &self->public_symbol_map[original_symbol];
*end = *start + 1;
unsigned idx = 0;
for (;;) {
TSSymbol symbol = self->alias_map[idx++];
if (symbol == 0 || symbol > original_symbol) break;
uint16_t count = self->alias_map[idx++];
if (symbol == original_symbol) {
*start = &self->alias_map[idx];
*end = &self->alias_map[idx + count];
break;
}
idx += count;
}
}
static inline void ts_language_write_symbol_as_dot_string(
const TSLanguage *self,
FILE *f,
TSSymbol symbol
) {
const char *name = ts_language_symbol_name(self, symbol);
for (const char *chr = name; *chr; chr++) {
switch (*chr) {
case '"':
case '\\':
fputc('\\', f);
fputc(*chr, f);
break;
case '\n':
fputs("\\n", f);
break;
case '\t':
fputs("\\t", f);
break;
default:
fputc(*chr, f);
break;
}
}
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_LANGUAGE_H_

View File

@ -0,0 +1,52 @@
#ifndef TREE_SITTER_LENGTH_H_
#define TREE_SITTER_LENGTH_H_
#include <stdlib.h>
#include <stdbool.h>
#include "./point.h"
#include "tree_sitter/api.h"
typedef struct {
uint32_t bytes;
TSPoint extent;
} Length;
static const Length LENGTH_UNDEFINED = {0, {0, 1}};
static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}};
static inline bool length_is_undefined(Length length) {
return length.bytes == 0 && length.extent.column != 0;
}
static inline Length length_min(Length len1, Length len2) {
return (len1.bytes < len2.bytes) ? len1 : len2;
}
static inline Length length_add(Length len1, Length len2) {
Length result;
result.bytes = len1.bytes + len2.bytes;
result.extent = point_add(len1.extent, len2.extent);
return result;
}
static inline Length length_sub(Length len1, Length len2) {
Length result;
result.bytes = (len1.bytes >= len2.bytes) ? len1.bytes - len2.bytes : 0;
result.extent = point_sub(len1.extent, len2.extent);
return result;
}
static inline Length length_zero(void) {
Length result = {0, {0, 0}};
return result;
}
static inline Length length_saturating_sub(Length len1, Length len2) {
if (len1.bytes > len2.bytes) {
return length_sub(len1, len2);
} else {
return length_zero();
}
}
#endif

View File

@ -0,0 +1,483 @@
#include "./length.h"
#include "./lexer.h"
#include "./unicode.h"
#include "tree_sitter/api.h"
#include <stdarg.h>
#include <stdio.h>
#define LOG(message, character) \
if (self->logger.log) { \
snprintf( \
self->debug_buffer, \
TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
32 <= character && character < 127 ? \
message " character:'%c'" : \
message " character:%d", \
character \
); \
self->logger.log( \
self->logger.payload, \
TSLogTypeLex, \
self->debug_buffer \
); \
}
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
static const TSRange DEFAULT_RANGE = {
.start_point = {
.row = 0,
.column = 0,
},
.end_point = {
.row = UINT32_MAX,
.column = UINT32_MAX,
},
.start_byte = 0,
.end_byte = UINT32_MAX
};
/**
* Sets the column data to the given value and marks it valid.
* @param self The lexer state.
* @param val The new value of the column data.
*/
static void ts_lexer__set_column_data(Lexer *self, uint32_t val) {
self->column_data.valid = true;
self->column_data.value = val;
}
/**
* Increments the value of the column data; no-op if invalid.
* @param self The lexer state.
*/
static void ts_lexer__increment_column_data(Lexer *self) {
if (self->column_data.valid) {
self->column_data.value++;
}
}
/**
* Marks the column data as invalid.
* @param self The lexer state.
*/
static void ts_lexer__invalidate_column_data(Lexer *self) {
self->column_data.valid = false;
self->column_data.value = 0;
}
// Check if the lexer has reached EOF. This state is stored
// by setting the lexer's `current_included_range_index` such that
// it has consumed all of its available ranges.
static bool ts_lexer__eof(const TSLexer *_self) {
Lexer *self = (Lexer *)_self;
return self->current_included_range_index == self->included_range_count;
}
// Clear the currently stored chunk of source code, because the lexer's
// position has changed.
static void ts_lexer__clear_chunk(Lexer *self) {
self->chunk = NULL;
self->chunk_size = 0;
self->chunk_start = 0;
}
// Call the lexer's input callback to obtain a new chunk of source code
// for the current position.
static void ts_lexer__get_chunk(Lexer *self) {
self->chunk_start = self->current_position.bytes;
self->chunk = self->input.read(
self->input.payload,
self->current_position.bytes,
self->current_position.extent,
&self->chunk_size
);
if (!self->chunk_size) {
self->current_included_range_index = self->included_range_count;
self->chunk = NULL;
}
}
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
static void ts_lexer__get_lookahead(Lexer *self) {
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
uint32_t size = self->chunk_size - position_in_chunk;
if (size == 0) {
self->lookahead_size = 1;
self->data.lookahead = '\0';
return;
}
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
DecodeFunction decode =
self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 :
self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le :
self->input.encoding == TSInputEncodingUTF16BE ? ts_decode_utf16_be : self->input.decode;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
// If this chunk ended in the middle of a multi-byte character,
// try again with a fresh chunk.
if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
ts_lexer__get_chunk(self);
chunk = (const uint8_t *)self->chunk;
size = self->chunk_size;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
}
if (self->data.lookahead == TS_DECODE_ERROR) {
self->lookahead_size = 1;
}
}
static void ts_lexer_goto(Lexer *self, Length position) {
if (position.bytes != self->current_position.bytes) {
ts_lexer__invalidate_column_data(self);
}
self->current_position = position;
// Move to the first valid position at or after the given position.
bool found_included_range = false;
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *included_range = &self->included_ranges[i];
if (
included_range->end_byte > self->current_position.bytes &&
included_range->end_byte > included_range->start_byte
) {
if (included_range->start_byte >= self->current_position.bytes) {
self->current_position = (Length) {
.bytes = included_range->start_byte,
.extent = included_range->start_point,
};
}
self->current_included_range_index = i;
found_included_range = true;
break;
}
}
if (found_included_range) {
// If the current position is outside of the current chunk of text,
// then clear out the current chunk of text.
if (self->chunk && (
self->current_position.bytes < self->chunk_start ||
self->current_position.bytes >= self->chunk_start + self->chunk_size
)) {
ts_lexer__clear_chunk(self);
}
self->lookahead_size = 0;
self->data.lookahead = '\0';
}
// If the given position is beyond any of included ranges, move to the EOF
// state - past the end of the included ranges.
else {
self->current_included_range_index = self->included_range_count;
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
self->current_position = (Length) {
.bytes = last_included_range->end_byte,
.extent = last_included_range->end_point,
};
ts_lexer__clear_chunk(self);
self->lookahead_size = 1;
self->data.lookahead = '\0';
}
}
/**
* Actually advances the lexer. Does not log anything.
* @param self The lexer state.
* @param skip Whether to mark the consumed codepoint as whitespace.
*/
static void ts_lexer__do_advance(Lexer *self, bool skip) {
if (self->lookahead_size) {
if (self->data.lookahead == '\n') {
self->current_position.extent.row++;
self->current_position.extent.column = 0;
ts_lexer__set_column_data(self, 0);
} else {
bool is_bom = self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK;
if (!is_bom) ts_lexer__increment_column_data(self);
self->current_position.extent.column += self->lookahead_size;
}
self->current_position.bytes += self->lookahead_size;
}
const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
while (
self->current_position.bytes >= current_range->end_byte ||
current_range->end_byte == current_range->start_byte
) {
if (self->current_included_range_index < self->included_range_count) {
self->current_included_range_index++;
}
if (self->current_included_range_index < self->included_range_count) {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
} else {
current_range = NULL;
break;
}
}
if (skip) self->token_start_position = self->current_position;
if (current_range) {
if (
self->current_position.bytes < self->chunk_start ||
self->current_position.bytes >= self->chunk_start + self->chunk_size
) {
ts_lexer__get_chunk(self);
}
ts_lexer__get_lookahead(self);
} else {
ts_lexer__clear_chunk(self);
self->data.lookahead = '\0';
self->lookahead_size = 1;
}
}
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
Lexer *self = (Lexer *)_self;
if (!self->chunk) return;
if (skip) {
LOG("skip", self->data.lookahead)
} else {
LOG("consume", self->data.lookahead)
}
ts_lexer__do_advance(self, skip);
}
// Mark that a token match has completed. This can be called multiple
// times if a longer match is found later.
static void ts_lexer__mark_end(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
if (!ts_lexer__eof(&self->data)) {
// If the lexer is right at the beginning of included range,
// then the token should be considered to end at the *end* of the
// previous included range, rather than here.
TSRange *current_included_range = &self->included_ranges[
self->current_included_range_index
];
if (
self->current_included_range_index > 0 &&
self->current_position.bytes == current_included_range->start_byte
) {
TSRange *previous_included_range = current_included_range - 1;
self->token_end_position = (Length) {
previous_included_range->end_byte,
previous_included_range->end_point,
};
return;
}
}
self->token_end_position = self->current_position;
}
static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
self->did_get_column = true;
if (!self->column_data.valid) {
// Record current position
uint32_t goal_byte = self->current_position.bytes;
// Back up to the beginning of the line
Length start_of_col = {
self->current_position.bytes - self->current_position.extent.column,
{self->current_position.extent.row, 0},
};
ts_lexer_goto(self, start_of_col);
ts_lexer__set_column_data(self, 0);
ts_lexer__get_chunk(self);
if (!ts_lexer__eof(_self)) {
ts_lexer__get_lookahead(self);
// Advance to the recorded position
while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
ts_lexer__do_advance(self, false);
if (ts_lexer__eof(_self)) break;
}
}
}
return self->column_data.value;
}
// Is the lexer at a boundary between two disjoint included ranges of
// source code? This is exposed as an API because some languages' external
// scanners need to perform custom actions at these boundaries.
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
const Lexer *self = (const Lexer *)_self;
if (self->current_included_range_index < self->included_range_count) {
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
return self->current_position.bytes == current_range->start_byte;
} else {
return false;
}
}
static void ts_lexer__log(const TSLexer *_self, const char *fmt, ...) {
Lexer *self = (Lexer *)_self;
va_list args;
va_start(args, fmt);
if (self->logger.log) {
vsnprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, fmt, args);
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer);
}
va_end(args);
}
void ts_lexer_init(Lexer *self) {
*self = (Lexer) {
.data = {
// The lexer's methods are stored as struct fields so that generated
// parsers can call them without needing to be linked against this
// library.
.advance = ts_lexer__advance,
.mark_end = ts_lexer__mark_end,
.get_column = ts_lexer__get_column,
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
.eof = ts_lexer__eof,
.log = ts_lexer__log,
.lookahead = 0,
.result_symbol = 0,
},
.chunk = NULL,
.chunk_size = 0,
.chunk_start = 0,
.current_position = {0, {0, 0}},
.logger = {
.payload = NULL,
.log = NULL
},
.included_ranges = NULL,
.included_range_count = 0,
.current_included_range_index = 0,
.did_get_column = false,
.column_data = {
.valid = false,
.value = 0
}
};
ts_lexer_set_included_ranges(self, NULL, 0);
}
void ts_lexer_delete(Lexer *self) {
ts_free(self->included_ranges);
}
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__clear_chunk(self);
ts_lexer_goto(self, self->current_position);
}
// Move the lexer to the given position. This doesn't do any work
// if the parser is already at the given position.
void ts_lexer_reset(Lexer *self, Length position) {
if (position.bytes != self->current_position.bytes) {
ts_lexer_goto(self, position);
}
}
void ts_lexer_start(Lexer *self) {
self->token_start_position = self->current_position;
self->token_end_position = LENGTH_UNDEFINED;
self->data.result_symbol = 0;
self->did_get_column = false;
if (!ts_lexer__eof(&self->data)) {
if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
if (self->current_position.bytes == 0) {
if (self->data.lookahead == BYTE_ORDER_MARK) {
ts_lexer__advance(&self->data, true);
}
ts_lexer__set_column_data(self, 0);
}
}
}
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
if (length_is_undefined(self->token_end_position)) {
ts_lexer__mark_end(&self->data);
}
// If the token ended at an included range boundary, then its end position
// will have been reset to the end of the preceding range. Reset the start
// position to match.
if (self->token_end_position.bytes < self->token_start_position.bytes) {
self->token_start_position = self->token_end_position;
}
uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;
// In order to determine that a byte sequence is invalid UTF8 or UTF16,
// the character decoding algorithm may have looked at the following byte.
// Therefore, the next byte *after* the current (invalid) character
// affects the interpretation of the current character.
if (self->data.lookahead == TS_DECODE_ERROR) {
current_lookahead_end_byte += 4; // the maximum number of bytes read to identify an invalid code point
}
if (current_lookahead_end_byte > *lookahead_end_byte) {
*lookahead_end_byte = current_lookahead_end_byte;
}
}
void ts_lexer_mark_end(Lexer *self) {
ts_lexer__mark_end(&self->data);
}
bool ts_lexer_set_included_ranges(
Lexer *self,
const TSRange *ranges,
uint32_t count
) {
if (count == 0 || !ranges) {
ranges = &DEFAULT_RANGE;
count = 1;
} else {
uint32_t previous_byte = 0;
for (unsigned i = 0; i < count; i++) {
const TSRange *range = &ranges[i];
if (
range->start_byte < previous_byte ||
range->end_byte < range->start_byte
) return false;
previous_byte = range->end_byte;
}
}
size_t size = count * sizeof(TSRange);
self->included_ranges = ts_realloc(self->included_ranges, size);
memcpy(self->included_ranges, ranges, size);
self->included_range_count = count;
ts_lexer_goto(self, self->current_position);
return true;
}
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
*count = self->included_range_count;
return self->included_ranges;
}
#undef LOG

View File

@ -0,0 +1,54 @@
#ifndef TREE_SITTER_LEXER_H_
#define TREE_SITTER_LEXER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./length.h"
#include "./subtree.h"
#include "tree_sitter/api.h"
#include "./parser.h"
typedef struct {
uint32_t value;
bool valid;
} ColumnData;
typedef struct {
TSLexer data;
Length current_position;
Length token_start_position;
Length token_end_position;
TSRange *included_ranges;
const char *chunk;
TSInput input;
TSLogger logger;
uint32_t included_range_count;
uint32_t current_included_range_index;
uint32_t chunk_start;
uint32_t chunk_size;
uint32_t lookahead_size;
bool did_get_column;
ColumnData column_data;
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
} Lexer;
void ts_lexer_init(Lexer *self);
void ts_lexer_delete(Lexer *self);
void ts_lexer_set_input(Lexer *self, TSInput input);
void ts_lexer_reset(Lexer *self, Length position);
void ts_lexer_start(Lexer *self);
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte);
void ts_lexer_mark_end(Lexer *self);
bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count);
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_LEXER_H_

View File

@ -0,0 +1,12 @@
#include "./alloc.c"
#include "./get_changed_ranges.c"
#include "./language.c"
#include "./lexer.c"
#include "./node.c"
#include "./parser.c"
#include "./query.c"
#include "./stack.c"
#include "./subtree.c"
#include "./tree_cursor.c"
#include "./tree.c"
#include "./wasm_store.c"

View File

@ -0,0 +1,875 @@
#include <stdbool.h>
#include "./point.h"
#include "./subtree.h"
#include "./tree.h"
#include "./language.h"
typedef struct {
Subtree parent;
const TSTree *tree;
Length position;
uint32_t child_index;
uint32_t structural_child_index;
const TSSymbol *alias_sequence;
} NodeChildIterator;
static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous);
// TSNode - constructors
TSNode ts_node_new(
const TSTree *tree,
const Subtree *subtree,
Length position,
TSSymbol alias
) {
return (TSNode) {
{position.bytes, position.extent.row, position.extent.column, alias},
subtree,
tree,
};
}
static inline TSNode ts_node__null(void) {
return ts_node_new(NULL, NULL, length_zero(), 0);
}
// TSNode - accessors
uint32_t ts_node_start_byte(TSNode self) {
return self.context[0];
}
TSPoint ts_node_start_point(TSNode self) {
return (TSPoint) {self.context[1], self.context[2]};
}
static inline uint32_t ts_node__alias(const TSNode *self) {
return self->context[3];
}
static inline Subtree ts_node__subtree(TSNode self) {
return *(const Subtree *)self.id;
}
// NodeChildIterator
static inline NodeChildIterator ts_node_iterate_children(const TSNode *node) {
Subtree subtree = ts_node__subtree(*node);
if (ts_subtree_child_count(subtree) == 0) {
return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL};
}
const TSSymbol *alias_sequence = ts_language_alias_sequence(
node->tree->language,
subtree.ptr->production_id
);
return (NodeChildIterator) {
.tree = node->tree,
.parent = subtree,
.position = {ts_node_start_byte(*node), ts_node_start_point(*node)},
.child_index = 0,
.structural_child_index = 0,
.alias_sequence = alias_sequence,
};
}
static inline bool ts_node_child_iterator_done(NodeChildIterator *self) {
return self->child_index == self->parent.ptr->child_count;
}
static inline bool ts_node_child_iterator_next(
NodeChildIterator *self,
TSNode *result
) {
if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false;
const Subtree *child = &ts_subtree_children(self->parent)[self->child_index];
TSSymbol alias_symbol = 0;
if (!ts_subtree_extra(*child)) {
if (self->alias_sequence) {
alias_symbol = self->alias_sequence[self->structural_child_index];
}
self->structural_child_index++;
}
if (self->child_index > 0) {
self->position = length_add(self->position, ts_subtree_padding(*child));
}
*result = ts_node_new(
self->tree,
child,
self->position,
alias_symbol
);
self->position = length_add(self->position, ts_subtree_size(*child));
self->child_index++;
return true;
}
// TSNode - private
static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) {
Subtree tree = ts_node__subtree(self);
if (include_anonymous) {
return ts_subtree_visible(tree) || ts_node__alias(&self);
} else {
TSSymbol alias = ts_node__alias(&self);
if (alias) {
return ts_language_symbol_metadata(self.tree->language, alias).named;
} else {
return ts_subtree_visible(tree) && ts_subtree_named(tree);
}
}
}
static inline uint32_t ts_node__relevant_child_count(
TSNode self,
bool include_anonymous
) {
Subtree tree = ts_node__subtree(self);
if (ts_subtree_child_count(tree) > 0) {
if (include_anonymous) {
return tree.ptr->visible_child_count;
} else {
return tree.ptr->named_child_count;
}
} else {
return 0;
}
}
static inline TSNode ts_node__child(
TSNode self,
uint32_t child_index,
bool include_anonymous
) {
TSNode result = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
uint32_t index = 0;
NodeChildIterator iterator = ts_node_iterate_children(&result);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node__is_relevant(child, include_anonymous)) {
if (index == child_index) {
return child;
}
index++;
} else {
uint32_t grandchild_index = child_index - index;
uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous);
if (grandchild_index < grandchild_count) {
did_descend = true;
result = child;
child_index = grandchild_index;
break;
}
index += grandchild_count;
}
}
}
return ts_node__null();
}
static bool ts_subtree_has_trailing_empty_descendant(
Subtree self,
Subtree other
) {
for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) {
Subtree child = ts_subtree_children(self)[i];
if (ts_subtree_total_bytes(child) > 0) break;
if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) {
return true;
}
}
return false;
}
static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) {
Subtree self_subtree = ts_node__subtree(self);
bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0;
uint32_t target_end_byte = ts_node_end_byte(self);
TSNode node = ts_node_parent(self);
TSNode earlier_node = ts_node__null();
bool earlier_node_is_relevant = false;
while (!ts_node_is_null(node)) {
TSNode earlier_child = ts_node__null();
bool earlier_child_is_relevant = false;
bool found_child_containing_target = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (child.id == self.id) break;
if (iterator.position.bytes > target_end_byte) {
found_child_containing_target = true;
break;
}
if (iterator.position.bytes == target_end_byte &&
(!self_is_empty ||
ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) {
found_child_containing_target = true;
break;
}
if (ts_node__is_relevant(child, include_anonymous)) {
earlier_child = child;
earlier_child_is_relevant = true;
} else if (ts_node__relevant_child_count(child, include_anonymous) > 0) {
earlier_child = child;
earlier_child_is_relevant = false;
}
}
if (found_child_containing_target) {
if (!ts_node_is_null(earlier_child)) {
earlier_node = earlier_child;
earlier_node_is_relevant = earlier_child_is_relevant;
}
node = child;
} else if (earlier_child_is_relevant) {
return earlier_child;
} else if (!ts_node_is_null(earlier_child)) {
node = earlier_child;
} else if (earlier_node_is_relevant) {
return earlier_node;
} else {
node = earlier_node;
earlier_node = ts_node__null();
earlier_node_is_relevant = false;
}
}
return ts_node__null();
}
static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) {
uint32_t target_end_byte = ts_node_end_byte(self);
TSNode node = ts_node_parent(self);
TSNode later_node = ts_node__null();
bool later_node_is_relevant = false;
while (!ts_node_is_null(node)) {
TSNode later_child = ts_node__null();
bool later_child_is_relevant = false;
TSNode child_containing_target = ts_node__null();
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (iterator.position.bytes <= target_end_byte) continue;
uint32_t start_byte = ts_node_start_byte(self);
uint32_t child_start_byte = ts_node_start_byte(child);
bool is_empty = start_byte == target_end_byte;
bool contains_target = is_empty ?
child_start_byte < start_byte :
child_start_byte <= start_byte;
if (contains_target) {
if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) {
child_containing_target = child;
}
} else if (ts_node__is_relevant(child, include_anonymous)) {
later_child = child;
later_child_is_relevant = true;
break;
} else if (ts_node__relevant_child_count(child, include_anonymous) > 0) {
later_child = child;
later_child_is_relevant = false;
break;
}
}
if (!ts_node_is_null(child_containing_target)) {
if (!ts_node_is_null(later_child)) {
later_node = later_child;
later_node_is_relevant = later_child_is_relevant;
}
node = child_containing_target;
} else if (later_child_is_relevant) {
return later_child;
} else if (!ts_node_is_null(later_child)) {
node = later_child;
} else if (later_node_is_relevant) {
return later_node;
} else {
node = later_node;
}
}
return ts_node__null();
}
static inline TSNode ts_node__first_child_for_byte(
TSNode self,
uint32_t goal,
bool include_anonymous
) {
TSNode node = self;
bool did_descend = true;
NodeChildIterator last_iterator;
bool has_last_iterator = false;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
loop:
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node_end_byte(child) > goal) {
if (ts_node__is_relevant(child, include_anonymous)) {
return child;
} else if (ts_node_child_count(child) > 0) {
if (iterator.child_index < ts_subtree_child_count(ts_node__subtree(child))) {
last_iterator = iterator;
has_last_iterator = true;
}
did_descend = true;
node = child;
break;
}
}
}
if (!did_descend && has_last_iterator) {
iterator = last_iterator;
has_last_iterator = false;
goto loop;
}
}
return ts_node__null();
}
static inline TSNode ts_node__descendant_for_byte_range(
TSNode self,
uint32_t range_start,
uint32_t range_end,
bool include_anonymous
) {
if (range_start > range_end) {
return ts_node__null();
}
TSNode node = self;
TSNode last_visible_node = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
uint32_t node_end = iterator.position.bytes;
// The end of this node must extend far enough forward to touch
// the end of the range
if (node_end < range_end) continue;
// ...and exceed the start of the range, unless the node itself is
// empty, in which case it must at least be equal to the start of the range.
bool is_empty = ts_node_start_byte(child) == node_end;
if (is_empty ? node_end < range_start : node_end <= range_start) continue;
// The start of this node must extend far enough backward to
// touch the start of the range.
if (range_start < ts_node_start_byte(child)) break;
node = child;
if (ts_node__is_relevant(node, include_anonymous)) {
last_visible_node = node;
}
did_descend = true;
break;
}
}
return last_visible_node;
}
static inline TSNode ts_node__descendant_for_point_range(
TSNode self,
TSPoint range_start,
TSPoint range_end,
bool include_anonymous
) {
if (point_gt(range_start, range_end)) {
return ts_node__null();
}
TSNode node = self;
TSNode last_visible_node = self;
bool did_descend = true;
while (did_descend) {
did_descend = false;
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&node);
while (ts_node_child_iterator_next(&iterator, &child)) {
TSPoint node_end = iterator.position.extent;
// The end of this node must extend far enough forward to touch
// the end of the range
if (point_lt(node_end, range_end)) continue;
// ...and exceed the start of the range, unless the node itself is
// empty, in which case it must at least be equal to the start of the range.
bool is_empty = point_eq(ts_node_start_point(child), node_end);
if (is_empty ? point_lt(node_end, range_start) : point_lte(node_end, range_start)) {
continue;
}
// The start of this node must extend far enough backward to
// touch the start of the range.
if (point_lt(range_start, ts_node_start_point(child))) break;
node = child;
if (ts_node__is_relevant(node, include_anonymous)) {
last_visible_node = node;
}
did_descend = true;
break;
}
}
return last_visible_node;
}
// TSNode - public
uint32_t ts_node_end_byte(TSNode self) {
return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes;
}
TSPoint ts_node_end_point(TSNode self) {
return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent);
}
TSSymbol ts_node_symbol(TSNode self) {
TSSymbol symbol = ts_node__alias(&self);
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
return ts_language_public_symbol(self.tree->language, symbol);
}
const char *ts_node_type(TSNode self) {
TSSymbol symbol = ts_node__alias(&self);
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
return ts_language_symbol_name(self.tree->language, symbol);
}
const TSLanguage *ts_node_language(TSNode self) {
return self.tree->language;
}
TSSymbol ts_node_grammar_symbol(TSNode self) {
return ts_subtree_symbol(ts_node__subtree(self));
}
const char *ts_node_grammar_type(TSNode self) {
TSSymbol symbol = ts_subtree_symbol(ts_node__subtree(self));
return ts_language_symbol_name(self.tree->language, symbol);
}
char *ts_node_string(TSNode self) {
TSSymbol alias_symbol = ts_node__alias(&self);
return ts_subtree_string(
ts_node__subtree(self),
alias_symbol,
ts_language_symbol_metadata(self.tree->language, alias_symbol).visible,
self.tree->language,
false
);
}
bool ts_node_eq(TSNode self, TSNode other) {
return self.tree == other.tree && self.id == other.id;
}
bool ts_node_is_null(TSNode self) {
return self.id == 0;
}
bool ts_node_is_extra(TSNode self) {
return ts_subtree_extra(ts_node__subtree(self));
}
bool ts_node_is_named(TSNode self) {
TSSymbol alias = ts_node__alias(&self);
return alias
? ts_language_symbol_metadata(self.tree->language, alias).named
: ts_subtree_named(ts_node__subtree(self));
}
bool ts_node_is_missing(TSNode self) {
return ts_subtree_missing(ts_node__subtree(self));
}
bool ts_node_has_changes(TSNode self) {
return ts_subtree_has_changes(ts_node__subtree(self));
}
bool ts_node_has_error(TSNode self) {
return ts_subtree_error_cost(ts_node__subtree(self)) > 0;
}
bool ts_node_is_error(TSNode self) {
TSSymbol symbol = ts_node_symbol(self);
return symbol == ts_builtin_sym_error;
}
uint32_t ts_node_descendant_count(TSNode self) {
return ts_subtree_visible_descendant_count(ts_node__subtree(self)) + 1;
}
TSStateId ts_node_parse_state(TSNode self) {
return ts_subtree_parse_state(ts_node__subtree(self));
}
TSStateId ts_node_next_parse_state(TSNode self) {
const TSLanguage *language = self.tree->language;
uint16_t state = ts_node_parse_state(self);
if (state == TS_TREE_STATE_NONE) {
return TS_TREE_STATE_NONE;
}
uint16_t symbol = ts_node_grammar_symbol(self);
return ts_language_next_state(language, state, symbol);
}
TSNode ts_node_parent(TSNode self) {
TSNode node = ts_tree_root_node(self.tree);
if (node.id == self.id) return ts_node__null();
while (true) {
TSNode next_node = ts_node_child_with_descendant(node, self);
if (next_node.id == self.id || ts_node_is_null(next_node)) break;
node = next_node;
}
return node;
}
TSNode ts_node_child_with_descendant(TSNode self, TSNode descendant) {
uint32_t start_byte = ts_node_start_byte(descendant);
uint32_t end_byte = ts_node_end_byte(descendant);
bool is_empty = start_byte == end_byte;
do {
NodeChildIterator iter = ts_node_iterate_children(&self);
do {
if (
!ts_node_child_iterator_next(&iter, &self)
|| ts_node_start_byte(self) > start_byte
) {
return ts_node__null();
}
if (self.id == descendant.id) {
return self;
}
// If the descendant is empty, and the end byte is within `self`,
// we check whether `self` contains it or not.
if (is_empty && iter.position.bytes >= end_byte && ts_node_child_count(self) > 0) {
TSNode child = ts_node_child_with_descendant(self, descendant);
// If the child is not null, return self if it's relevant, else return the child
if (!ts_node_is_null(child)) {
return ts_node__is_relevant(self, true) ? self : child;
}
}
} while ((is_empty ? iter.position.bytes <= end_byte : iter.position.bytes < end_byte) || ts_node_child_count(self) == 0);
} while (!ts_node__is_relevant(self, true));
return self;
}
TSNode ts_node_child(TSNode self, uint32_t child_index) {
return ts_node__child(self, child_index, true);
}
TSNode ts_node_named_child(TSNode self, uint32_t child_index) {
return ts_node__child(self, child_index, false);
}
TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) {
recur:
if (!field_id || ts_node_child_count(self) == 0) return ts_node__null();
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self.tree->language,
ts_node__subtree(self).ptr->production_id,
&field_map,
&field_map_end
);
if (field_map == field_map_end) return ts_node__null();
// The field mappings are sorted by their field id. Scan all
// the mappings to find the ones for the given field id.
while (field_map->field_id < field_id) {
field_map++;
if (field_map == field_map_end) return ts_node__null();
}
while (field_map_end[-1].field_id > field_id) {
field_map_end--;
if (field_map == field_map_end) return ts_node__null();
}
TSNode child;
NodeChildIterator iterator = ts_node_iterate_children(&self);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (!ts_subtree_extra(ts_node__subtree(child))) {
uint32_t index = iterator.structural_child_index - 1;
if (index < field_map->child_index) continue;
// Hidden nodes' fields are "inherited" by their visible parent.
if (field_map->inherited) {
// If this is the *last* possible child node for this field,
// then perform a tail call to avoid recursion.
if (field_map + 1 == field_map_end) {
self = child;
goto recur;
}
// Otherwise, descend into this child, but if it doesn't contain
// the field, continue searching subsequent children.
else {
TSNode result = ts_node_child_by_field_id(child, field_id);
if (result.id) return result;
field_map++;
if (field_map == field_map_end) return ts_node__null();
}
}
else if (ts_node__is_relevant(child, true)) {
return child;
}
// If the field refers to a hidden node with visible children,
// return the first visible child.
else if (ts_node_child_count(child) > 0 ) {
return ts_node_child(child, 0);
}
// Otherwise, continue searching subsequent children.
else {
field_map++;
if (field_map == field_map_end) return ts_node__null();
}
}
}
return ts_node__null();
}
static inline const char *ts_node__field_name_from_language(TSNode self, uint32_t structural_child_index) {
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self.tree->language,
ts_node__subtree(self).ptr->production_id,
&field_map,
&field_map_end
);
for (; field_map != field_map_end; field_map++) {
if (!field_map->inherited && field_map->child_index == structural_child_index) {
return self.tree->language->field_names[field_map->field_id];
}
}
return NULL;
}
const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index) {
TSNode result = self;
bool did_descend = true;
const char *inherited_field_name = NULL;
while (did_descend) {
did_descend = false;
TSNode child;
uint32_t index = 0;
NodeChildIterator iterator = ts_node_iterate_children(&result);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node__is_relevant(child, true)) {
if (index == child_index) {
if (ts_node_is_extra(child)) {
return NULL;
}
const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1);
if (field_name) return field_name;
return inherited_field_name;
}
index++;
} else {
uint32_t grandchild_index = child_index - index;
uint32_t grandchild_count = ts_node__relevant_child_count(child, true);
if (grandchild_index < grandchild_count) {
const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1);
if (field_name) inherited_field_name = field_name;
did_descend = true;
result = child;
child_index = grandchild_index;
break;
}
index += grandchild_count;
}
}
}
return NULL;
}
const char *ts_node_field_name_for_named_child(TSNode self, uint32_t named_child_index) {
TSNode result = self;
bool did_descend = true;
const char *inherited_field_name = NULL;
while (did_descend) {
did_descend = false;
TSNode child;
uint32_t index = 0;
NodeChildIterator iterator = ts_node_iterate_children(&result);
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node__is_relevant(child, false)) {
if (index == named_child_index) {
if (ts_node_is_extra(child)) {
return NULL;
}
const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1);
if (field_name) return field_name;
return inherited_field_name;
}
index++;
} else {
uint32_t named_grandchild_index = named_child_index - index;
uint32_t grandchild_count = ts_node__relevant_child_count(child, false);
if (named_grandchild_index < grandchild_count) {
const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1);
if (field_name) inherited_field_name = field_name;
did_descend = true;
result = child;
named_child_index = named_grandchild_index;
break;
}
index += grandchild_count;
}
}
}
return NULL;
}
TSNode ts_node_child_by_field_name(
TSNode self,
const char *name,
uint32_t name_length
) {
TSFieldId field_id = ts_language_field_id_for_name(
self.tree->language,
name,
name_length
);
return ts_node_child_by_field_id(self, field_id);
}
uint32_t ts_node_child_count(TSNode self) {
Subtree tree = ts_node__subtree(self);
if (ts_subtree_child_count(tree) > 0) {
return tree.ptr->visible_child_count;
} else {
return 0;
}
}
uint32_t ts_node_named_child_count(TSNode self) {
Subtree tree = ts_node__subtree(self);
if (ts_subtree_child_count(tree) > 0) {
return tree.ptr->named_child_count;
} else {
return 0;
}
}
TSNode ts_node_next_sibling(TSNode self) {
return ts_node__next_sibling(self, true);
}
TSNode ts_node_next_named_sibling(TSNode self) {
return ts_node__next_sibling(self, false);
}
TSNode ts_node_prev_sibling(TSNode self) {
return ts_node__prev_sibling(self, true);
}
TSNode ts_node_prev_named_sibling(TSNode self) {
return ts_node__prev_sibling(self, false);
}
TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) {
return ts_node__first_child_for_byte(self, byte, true);
}
TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) {
return ts_node__first_child_for_byte(self, byte, false);
}
TSNode ts_node_descendant_for_byte_range(
TSNode self,
uint32_t start,
uint32_t end
) {
return ts_node__descendant_for_byte_range(self, start, end, true);
}
TSNode ts_node_named_descendant_for_byte_range(
TSNode self,
uint32_t start,
uint32_t end
) {
return ts_node__descendant_for_byte_range(self, start, end, false);
}
TSNode ts_node_descendant_for_point_range(
TSNode self,
TSPoint start,
TSPoint end
) {
return ts_node__descendant_for_point_range(self, start, end, true);
}
TSNode ts_node_named_descendant_for_point_range(
TSNode self,
TSPoint start,
TSPoint end
) {
return ts_node__descendant_for_point_range(self, start, end, false);
}
void ts_node_edit(TSNode *self, const TSInputEdit *edit) {
uint32_t start_byte = ts_node_start_byte(*self);
TSPoint start_point = ts_node_start_point(*self);
if (start_byte >= edit->old_end_byte) {
start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte);
start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point));
} else if (start_byte > edit->start_byte) {
start_byte = edit->new_end_byte;
start_point = edit->new_end_point;
}
self->context[0] = start_byte;
self->context[1] = start_point.row;
self->context[2] = start_point.column;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,286 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSStateId;
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
typedef struct TSLanguageMetadata {
uint8_t major_version;
uint8_t minor_version;
uint8_t patch_version;
} TSLanguageMetadata;
#endif
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
// Used to index the field and supertype maps.
typedef struct {
uint16_t index;
uint16_t length;
} TSMapSlice;
typedef struct {
bool visible;
bool named;
bool supertype;
} TSSymbolMetadata;
typedef struct TSLexer TSLexer;
struct TSLexer {
int32_t lookahead;
TSSymbol result_symbol;
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
void (*log)(const TSLexer *, const char *, ...);
};
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef union {
struct {
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
int16_t dynamic_precedence;
uint16_t production_id;
} reduce;
uint8_t type;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
uint16_t reserved_word_set_id;
} TSLexerMode;
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable;
} entry;
} TSParseActionEntry;
typedef struct {
int32_t start;
int32_t end;
} TSCharacterRange;
struct TSLanguage {
uint32_t abi_version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
uint32_t state_count;
uint32_t large_state_count;
uint32_t production_id_count;
uint32_t field_count;
uint16_t max_alias_sequence_length;
const uint16_t *parse_table;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSParseActionEntry *parse_actions;
const char * const *symbol_names;
const char * const *field_names;
const TSMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const TSSymbolMetadata *symbol_metadata;
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSLexerMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
const TSStateId *primary_state_ids;
const char *name;
const TSSymbol *reserved_words;
uint16_t max_reserved_word_set_size;
uint32_t supertype_count;
const TSSymbol *supertype_symbols;
const TSMapSlice *supertype_map_slices;
const TSSymbol *supertype_map_entries;
TSLanguageMetadata metadata;
};
static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
uint32_t index = 0;
uint32_t size = len - index;
while (size > 1) {
uint32_t half_size = size / 2;
uint32_t mid_index = index + half_size;
const TSCharacterRange *range = &ranges[mid_index];
if (lookahead >= range->start && lookahead <= range->end) {
return true;
} else if (lookahead > range->end) {
index = mid_index;
}
size -= half_size;
}
const TSCharacterRange *range = &ranges[index];
return (lookahead >= range->start && lookahead <= range->end);
}
/*
* Lexer Macros
*/
#ifdef _MSC_VER
#define UNUSED __pragma(warning(suppress : 4101))
#else
#define UNUSED __attribute__((unused))
#endif
#define START_LEXER() \
bool result = false; \
bool skip = false; \
UNUSED \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
#define ADVANCE_MAP(...) \
{ \
static const uint16_t map[] = { __VA_ARGS__ }; \
for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \
if (map[i] == lookahead) { \
state = map[i + 1]; \
goto next_state; \
} \
} \
}
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
#define END_STATE() return result;
/*
* Parse Table Macros
*/
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
#define STATE(id) id
#define ACTIONS(id) id
#define SHIFT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = (state_value) \
} \
}}
#define SHIFT_REPEAT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = (state_value), \
.repetition = true \
} \
}}
#define SHIFT_EXTRA() \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.extra = true \
} \
}}
#define REDUCE(symbol_name, children, precedence, prod_id) \
{{ \
.reduce = { \
.type = TSParseActionTypeReduce, \
.symbol = symbol_name, \
.child_count = children, \
.dynamic_precedence = precedence, \
.production_id = prod_id \
}, \
}}
#define RECOVER() \
{{ \
.type = TSParseActionTypeRecover \
}}
#define ACCEPT_INPUT() \
{{ \
.type = TSParseActionTypeAccept \
}}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_

View File

@ -0,0 +1,48 @@
#ifndef TREE_SITTER_POINT_H_
#define TREE_SITTER_POINT_H_
#include "tree_sitter/api.h"
#define POINT_ZERO ((TSPoint) {0, 0})
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
static inline TSPoint point__new(unsigned row, unsigned column) {
TSPoint result = {row, column};
return result;
}
static inline TSPoint point_add(TSPoint a, TSPoint b) {
if (b.row > 0)
return point__new(a.row + b.row, b.column);
else
return point__new(a.row, a.column + b.column);
}
static inline TSPoint point_sub(TSPoint a, TSPoint b) {
if (a.row > b.row)
return point__new(a.row - b.row, a.column);
else
return point__new(0, (a.column >= b.column) ? a.column - b.column : 0);
}
static inline bool point_lte(TSPoint a, TSPoint b) {
return (a.row < b.row) || (a.row == b.row && a.column <= b.column);
}
static inline bool point_lt(TSPoint a, TSPoint b) {
return (a.row < b.row) || (a.row == b.row && a.column < b.column);
}
static inline bool point_gt(TSPoint a, TSPoint b) {
return (a.row > b.row) || (a.row == b.row && a.column > b.column);
}
static inline bool point_gte(TSPoint a, TSPoint b) {
return (a.row > b.row) || (a.row == b.row && a.column >= b.column);
}
static inline bool point_eq(TSPoint a, TSPoint b) {
return a.row == b.row && a.column == b.column;
}
#endif

View File

@ -0,0 +1,239 @@
// "License": Public Domain
// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like.
// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to
// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it
// an example on how to get the endian conversion functions on different platforms.
// updates from https://github.com/mikepb/endian.h/issues/4
#ifndef ENDIAN_H
#define ENDIAN_H
#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__)
# define __WINDOWS__
#endif
#if defined(HAVE_ENDIAN_H) || \
defined(__linux__) || \
defined(__GNU__) || \
defined(__illumos__) || \
defined(__NetBSD__) || \
defined(__OpenBSD__) || \
defined(__CYGWIN__) || \
defined(__MSYS__) || \
defined(__EMSCRIPTEN__) || \
defined(__wasi__)
#if defined(__NetBSD__)
#define _NETBSD_SOURCE 1
#endif
# include <endian.h>
#elif defined(HAVE_SYS_ENDIAN_H) || \
defined(__FreeBSD__) || \
defined(__DragonFly__)
# include <sys/endian.h>
#elif defined(__APPLE__)
# define __BYTE_ORDER BYTE_ORDER
# define __BIG_ENDIAN BIG_ENDIAN
# define __LITTLE_ENDIAN LITTLE_ENDIAN
# define __PDP_ENDIAN PDP_ENDIAN
# if !defined(_POSIX_C_SOURCE)
# include <libkern/OSByteOrder.h>
# define htobe16(x) OSSwapHostToBigInt16(x)
# define htole16(x) OSSwapHostToLittleInt16(x)
# define be16toh(x) OSSwapBigToHostInt16(x)
# define le16toh(x) OSSwapLittleToHostInt16(x)
# define htobe32(x) OSSwapHostToBigInt32(x)
# define htole32(x) OSSwapHostToLittleInt32(x)
# define be32toh(x) OSSwapBigToHostInt32(x)
# define le32toh(x) OSSwapLittleToHostInt32(x)
# define htobe64(x) OSSwapHostToBigInt64(x)
# define htole64(x) OSSwapHostToLittleInt64(x)
# define be64toh(x) OSSwapBigToHostInt64(x)
# define le64toh(x) OSSwapLittleToHostInt64(x)
# else
# if BYTE_ORDER == LITTLE_ENDIAN
# define htobe16(x) __builtin_bswap16(x)
# define htole16(x) (x)
# define be16toh(x) __builtin_bswap16(x)
# define le16toh(x) (x)
# define htobe32(x) __builtin_bswap32(x)
# define htole32(x) (x)
# define be32toh(x) __builtin_bswap32(x)
# define le32toh(x) (x)
# define htobe64(x) __builtin_bswap64(x)
# define htole64(x) (x)
# define be64toh(x) __builtin_bswap64(x)
# define le64toh(x) (x)
# elif BYTE_ORDER == BIG_ENDIAN
# define htobe16(x) (x)
# define htole16(x) __builtin_bswap16(x)
# define be16toh(x) (x)
# define le16toh(x) __builtin_bswap16(x)
# define htobe32(x) (x)
# define htole32(x) __builtin_bswap32(x)
# define be32toh(x) (x)
# define le32toh(x) __builtin_bswap32(x)
# define htobe64(x) (x)
# define htole64(x) __builtin_bswap64(x)
# define be64toh(x) (x)
# define le64toh(x) __builtin_bswap64(x)
# else
# error byte order not supported
# endif
# endif
#elif defined(__WINDOWS__)
# if defined(_MSC_VER) && !defined(__clang__)
# include <stdlib.h>
# define B_SWAP_16(x) _byteswap_ushort(x)
# define B_SWAP_32(x) _byteswap_ulong(x)
# define B_SWAP_64(x) _byteswap_uint64(x)
# else
# define B_SWAP_16(x) __builtin_bswap16(x)
# define B_SWAP_32(x) __builtin_bswap32(x)
# define B_SWAP_64(x) __builtin_bswap64(x)
# endif
# if defined(__MINGW32__) || defined(HAVE_SYS_PARAM_H)
# include <sys/param.h>
# endif
# ifndef BIG_ENDIAN
# ifdef __BIG_ENDIAN
# define BIG_ENDIAN __BIG_ENDIAN
# elif defined(__ORDER_BIG_ENDIAN__)
# define BIG_ENDIAN __ORDER_BIG_ENDIAN__
# else
# define BIG_ENDIAN 4321
# endif
# endif
# ifndef LITTLE_ENDIAN
# ifdef __LITTLE_ENDIAN
# define LITTLE_ENDIAN __LITTLE_ENDIAN
# elif defined(__ORDER_LITTLE_ENDIAN__)
# define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__
# else
# define LITTLE_ENDIAN 1234
# endif
# endif
# ifndef BYTE_ORDER
# ifdef __BYTE_ORDER
# define BYTE_ORDER __BYTE_ORDER
# elif defined(__BYTE_ORDER__)
# define BYTE_ORDER __BYTE_ORDER__
# else
/* assume LE on Windows if nothing was defined */
# define BYTE_ORDER LITTLE_ENDIAN
# endif
# endif
# if BYTE_ORDER == LITTLE_ENDIAN
# define htobe16(x) B_SWAP_16(x)
# define htole16(x) (x)
# define be16toh(x) B_SWAP_16(x)
# define le16toh(x) (x)
# define htobe32(x) B_SWAP_32(x)
# define htole32(x) (x)
# define be32toh(x) B_SWAP_32(x)
# define le32toh(x) (x)
# define htobe64(x) B_SWAP_64(x)
# define htole64(x) (x)
# define be64toh(x) B_SWAP_64(x)
# define le64toh(x) (x)
# elif BYTE_ORDER == BIG_ENDIAN
# define htobe16(x) (x)
# define htole16(x) B_SWAP_16(x)
# define be16toh(x) (x)
# define le16toh(x) B_SWAP_16(x)
# define htobe32(x) (x)
# define htole32(x) B_SWAP_32(x)
# define be32toh(x) (x)
# define le32toh(x) B_SWAP_32(x)
# define htobe64(x) (x)
# define htole64(x) B_SWAP_64(x)
# define be64toh(x) (x)
# define le64toh(x) B_SWAP_64(x)
# else
# error byte order not supported
# endif
#elif defined(__QNXNTO__)
# include <gulliver.h>
# define __LITTLE_ENDIAN 1234
# define __BIG_ENDIAN 4321
# define __PDP_ENDIAN 3412
# if defined(__BIGENDIAN__)
# define __BYTE_ORDER __BIG_ENDIAN
# define htobe16(x) (x)
# define htobe32(x) (x)
# define htobe64(x) (x)
# define htole16(x) ENDIAN_SWAP16(x)
# define htole32(x) ENDIAN_SWAP32(x)
# define htole64(x) ENDIAN_SWAP64(x)
# elif defined(__LITTLEENDIAN__)
# define __BYTE_ORDER __LITTLE_ENDIAN
# define htole16(x) (x)
# define htole32(x) (x)
# define htole64(x) (x)
# define htobe16(x) ENDIAN_SWAP16(x)
# define htobe32(x) ENDIAN_SWAP32(x)
# define htobe64(x) ENDIAN_SWAP64(x)
# else
# error byte order not supported
# endif
# define be16toh(x) ENDIAN_BE16(x)
# define be32toh(x) ENDIAN_BE32(x)
# define be64toh(x) ENDIAN_BE64(x)
# define le16toh(x) ENDIAN_LE16(x)
# define le32toh(x) ENDIAN_LE32(x)
# define le64toh(x) ENDIAN_LE64(x)
#else
# error platform not supported
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
#ifndef TREE_SITTER_REDUCE_ACTION_H_
#define TREE_SITTER_REDUCE_ACTION_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./array.h"
#include "tree_sitter/api.h"
typedef struct {
uint32_t count;
TSSymbol symbol;
int dynamic_precedence;
unsigned short production_id;
} ReduceAction;
typedef Array(ReduceAction) ReduceActionSet;
static inline void ts_reduce_action_set_add(ReduceActionSet *self,
ReduceAction new_action) {
for (uint32_t i = 0; i < self->size; i++) {
ReduceAction action = self->contents[i];
if (action.symbol == new_action.symbol && action.count == new_action.count)
return;
}
array_push(self, new_action);
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_REDUCE_ACTION_H_

View File

@ -0,0 +1,95 @@
#include "./subtree.h"
typedef struct {
Subtree tree;
uint32_t child_index;
uint32_t byte_offset;
} StackEntry;
typedef struct {
Array(StackEntry) stack;
Subtree last_external_token;
} ReusableNode;
static inline ReusableNode reusable_node_new(void) {
return (ReusableNode) {array_new(), NULL_SUBTREE};
}
static inline void reusable_node_clear(ReusableNode *self) {
array_clear(&self->stack);
self->last_external_token = NULL_SUBTREE;
}
static inline Subtree reusable_node_tree(ReusableNode *self) {
return self->stack.size > 0
? self->stack.contents[self->stack.size - 1].tree
: NULL_SUBTREE;
}
static inline uint32_t reusable_node_byte_offset(ReusableNode *self) {
return self->stack.size > 0
? self->stack.contents[self->stack.size - 1].byte_offset
: UINT32_MAX;
}
static inline void reusable_node_delete(ReusableNode *self) {
array_delete(&self->stack);
}
static inline void reusable_node_advance(ReusableNode *self) {
StackEntry last_entry = *array_back(&self->stack);
uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree);
if (ts_subtree_has_external_tokens(last_entry.tree)) {
self->last_external_token = ts_subtree_last_external_token(last_entry.tree);
}
Subtree tree;
uint32_t next_index;
do {
StackEntry popped_entry = array_pop(&self->stack);
next_index = popped_entry.child_index + 1;
if (self->stack.size == 0) return;
tree = array_back(&self->stack)->tree;
} while (ts_subtree_child_count(tree) <= next_index);
array_push(&self->stack, ((StackEntry) {
.tree = ts_subtree_children(tree)[next_index],
.child_index = next_index,
.byte_offset = byte_offset,
}));
}
static inline bool reusable_node_descend(ReusableNode *self) {
StackEntry last_entry = *array_back(&self->stack);
if (ts_subtree_child_count(last_entry.tree) > 0) {
array_push(&self->stack, ((StackEntry) {
.tree = ts_subtree_children(last_entry.tree)[0],
.child_index = 0,
.byte_offset = last_entry.byte_offset,
}));
return true;
} else {
return false;
}
}
static inline void reusable_node_advance_past_leaf(ReusableNode *self) {
while (reusable_node_descend(self)) {}
reusable_node_advance(self);
}
static inline void reusable_node_reset(ReusableNode *self, Subtree tree) {
reusable_node_clear(self);
array_push(&self->stack, ((StackEntry) {
.tree = tree,
.child_index = 0,
.byte_offset = 0,
}));
// Never reuse the root node, because it has a non-standard internal structure
// due to transformations that are applied when it is accepted: adding the EOF
// child and any extra children.
if (!reusable_node_descend(self)) {
reusable_node_clear(self);
}
}

View File

@ -0,0 +1,912 @@
#include "./alloc.h"
#include "./language.h"
#include "./subtree.h"
#include "./array.h"
#include "./stack.h"
#include "./length.h"
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#define MAX_LINK_COUNT 8
#define MAX_NODE_POOL_SIZE 50
#define MAX_ITERATOR_COUNT 64
#if defined _WIN32 && !defined __GNUC__
#define forceinline __forceinline
#else
#define forceinline static inline __attribute__((always_inline))
#endif
typedef struct StackNode StackNode;
typedef struct {
StackNode *node;
Subtree subtree;
bool is_pending;
} StackLink;
struct StackNode {
TSStateId state;
Length position;
StackLink links[MAX_LINK_COUNT];
short unsigned int link_count;
uint32_t ref_count;
unsigned error_cost;
unsigned node_count;
int dynamic_precedence;
};
typedef struct {
StackNode *node;
SubtreeArray subtrees;
uint32_t subtree_count;
bool is_pending;
} StackIterator;
typedef Array(StackNode *) StackNodeArray;
typedef enum {
StackStatusActive,
StackStatusPaused,
StackStatusHalted,
} StackStatus;
typedef struct {
StackNode *node;
StackSummary *summary;
unsigned node_count_at_last_error;
Subtree last_external_token;
Subtree lookahead_when_paused;
StackStatus status;
} StackHead;
struct Stack {
Array(StackHead) heads;
StackSliceArray slices;
Array(StackIterator) iterators;
StackNodeArray node_pool;
StackNode *base_node;
SubtreePool *subtree_pool;
};
typedef unsigned StackAction;
enum {
StackActionNone,
StackActionStop = 1,
StackActionPop = 2,
};
typedef StackAction (*StackCallback)(void *, const StackIterator *);
static void stack_node_retain(StackNode *self) {
if (!self)
return;
ts_assert(self->ref_count > 0);
self->ref_count++;
ts_assert(self->ref_count != 0);
}
static void stack_node_release(
StackNode *self,
StackNodeArray *pool,
SubtreePool *subtree_pool
) {
recur:
ts_assert(self->ref_count != 0);
self->ref_count--;
if (self->ref_count > 0) return;
StackNode *first_predecessor = NULL;
if (self->link_count > 0) {
for (unsigned i = self->link_count - 1; i > 0; i--) {
StackLink link = self->links[i];
if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree);
stack_node_release(link.node, pool, subtree_pool);
}
StackLink link = self->links[0];
if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree);
first_predecessor = self->links[0].node;
}
if (pool->size < MAX_NODE_POOL_SIZE) {
array_push(pool, self);
} else {
ts_free(self);
}
if (first_predecessor) {
self = first_predecessor;
goto recur;
}
}
/// Get the number of nodes in the subtree, for the purpose of measuring
/// how much progress has been made by a given version of the stack.
static uint32_t stack__subtree_node_count(Subtree subtree) {
uint32_t count = ts_subtree_visible_descendant_count(subtree);
if (ts_subtree_visible(subtree)) count++;
// Count intermediate error nodes even though they are not visible,
// because a stack version's node count is used to check whether it
// has made any progress since the last time it encountered an error.
if (ts_subtree_symbol(subtree) == ts_builtin_sym_error_repeat) count++;
return count;
}
static StackNode *stack_node_new(
StackNode *previous_node,
Subtree subtree,
bool is_pending,
TSStateId state,
StackNodeArray *pool
) {
StackNode *node = pool->size > 0
? array_pop(pool)
: ts_malloc(sizeof(StackNode));
*node = (StackNode) {
.ref_count = 1,
.link_count = 0,
.state = state
};
if (previous_node) {
node->link_count = 1;
node->links[0] = (StackLink) {
.node = previous_node,
.subtree = subtree,
.is_pending = is_pending,
};
node->position = previous_node->position;
node->error_cost = previous_node->error_cost;
node->dynamic_precedence = previous_node->dynamic_precedence;
node->node_count = previous_node->node_count;
if (subtree.ptr) {
node->error_cost += ts_subtree_error_cost(subtree);
node->position = length_add(node->position, ts_subtree_total_size(subtree));
node->node_count += stack__subtree_node_count(subtree);
node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree);
}
} else {
node->position = length_zero();
node->error_cost = 0;
}
return node;
}
static bool stack__subtree_is_equivalent(Subtree left, Subtree right) {
if (left.ptr == right.ptr) return true;
if (!left.ptr || !right.ptr) return false;
// Symbols must match
if (ts_subtree_symbol(left) != ts_subtree_symbol(right)) return false;
// If both have errors, don't bother keeping both.
if (ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) return true;
return (
ts_subtree_padding(left).bytes == ts_subtree_padding(right).bytes &&
ts_subtree_size(left).bytes == ts_subtree_size(right).bytes &&
ts_subtree_child_count(left) == ts_subtree_child_count(right) &&
ts_subtree_extra(left) == ts_subtree_extra(right) &&
ts_subtree_external_scanner_state_eq(left, right)
);
}
static void stack_node_add_link(
StackNode *self,
StackLink link,
SubtreePool *subtree_pool
) {
if (link.node == self) return;
for (int i = 0; i < self->link_count; i++) {
StackLink *existing_link = &self->links[i];
if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) {
// In general, we preserve ambiguities until they are removed from the stack
// during a pop operation where multiple paths lead to the same node. But in
// the special case where two links directly connect the same pair of nodes,
// we can safely remove the ambiguity ahead of time without changing behavior.
if (existing_link->node == link.node) {
if (
ts_subtree_dynamic_precedence(link.subtree) >
ts_subtree_dynamic_precedence(existing_link->subtree)
) {
ts_subtree_retain(link.subtree);
ts_subtree_release(subtree_pool, existing_link->subtree);
existing_link->subtree = link.subtree;
self->dynamic_precedence =
link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree);
}
return;
}
// If the previous nodes are mergeable, merge them recursively.
if (
existing_link->node->state == link.node->state &&
existing_link->node->position.bytes == link.node->position.bytes &&
existing_link->node->error_cost == link.node->error_cost
) {
for (int j = 0; j < link.node->link_count; j++) {
stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool);
}
int32_t dynamic_precedence = link.node->dynamic_precedence;
if (link.subtree.ptr) {
dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree);
}
if (dynamic_precedence > self->dynamic_precedence) {
self->dynamic_precedence = dynamic_precedence;
}
return;
}
}
}
if (self->link_count == MAX_LINK_COUNT) return;
stack_node_retain(link.node);
unsigned node_count = link.node->node_count;
int dynamic_precedence = link.node->dynamic_precedence;
self->links[self->link_count++] = link;
if (link.subtree.ptr) {
ts_subtree_retain(link.subtree);
node_count += stack__subtree_node_count(link.subtree);
dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree);
}
if (node_count > self->node_count) self->node_count = node_count;
if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence;
}
static void stack_head_delete(
StackHead *self,
StackNodeArray *pool,
SubtreePool *subtree_pool
) {
if (self->node) {
if (self->last_external_token.ptr) {
ts_subtree_release(subtree_pool, self->last_external_token);
}
if (self->lookahead_when_paused.ptr) {
ts_subtree_release(subtree_pool, self->lookahead_when_paused);
}
if (self->summary) {
array_delete(self->summary);
ts_free(self->summary);
}
stack_node_release(self->node, pool, subtree_pool);
}
}
static StackVersion ts_stack__add_version(
Stack *self,
StackVersion original_version,
StackNode *node
) {
StackHead head = {
.node = node,
.node_count_at_last_error = array_get(&self->heads, original_version)->node_count_at_last_error,
.last_external_token = array_get(&self->heads, original_version)->last_external_token,
.status = StackStatusActive,
.lookahead_when_paused = NULL_SUBTREE,
};
array_push(&self->heads, head);
stack_node_retain(node);
if (head.last_external_token.ptr) ts_subtree_retain(head.last_external_token);
return (StackVersion)(self->heads.size - 1);
}
static void ts_stack__add_slice(
Stack *self,
StackVersion original_version,
StackNode *node,
SubtreeArray *subtrees
) {
for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) {
StackVersion version = array_get(&self->slices, i)->version;
if (array_get(&self->heads, version)->node == node) {
StackSlice slice = {*subtrees, version};
array_insert(&self->slices, i + 1, slice);
return;
}
}
StackVersion version = ts_stack__add_version(self, original_version, node);
StackSlice slice = { *subtrees, version };
array_push(&self->slices, slice);
}
static StackSliceArray stack__iter(
Stack *self,
StackVersion version,
StackCallback callback,
void *payload,
int goal_subtree_count
) {
array_clear(&self->slices);
array_clear(&self->iterators);
StackHead *head = array_get(&self->heads, version);
StackIterator new_iterator = {
.node = head->node,
.subtrees = array_new(),
.subtree_count = 0,
.is_pending = true,
};
bool include_subtrees = false;
if (goal_subtree_count >= 0) {
include_subtrees = true;
array_reserve(&new_iterator.subtrees, (uint32_t)ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree));
}
array_push(&self->iterators, new_iterator);
while (self->iterators.size > 0) {
for (uint32_t i = 0, size = self->iterators.size; i < size; i++) {
StackIterator *iterator = array_get(&self->iterators, i);
StackNode *node = iterator->node;
StackAction action = callback(payload, iterator);
bool should_pop = action & StackActionPop;
bool should_stop = action & StackActionStop || node->link_count == 0;
if (should_pop) {
SubtreeArray subtrees = iterator->subtrees;
if (!should_stop) {
ts_subtree_array_copy(subtrees, &subtrees);
}
ts_subtree_array_reverse(&subtrees);
ts_stack__add_slice(
self,
version,
node,
&subtrees
);
}
if (should_stop) {
if (!should_pop) {
ts_subtree_array_delete(self->subtree_pool, &iterator->subtrees);
}
array_erase(&self->iterators, i);
i--, size--;
continue;
}
for (uint32_t j = 1; j <= node->link_count; j++) {
StackIterator *next_iterator;
StackLink link;
if (j == node->link_count) {
link = node->links[0];
next_iterator = array_get(&self->iterators, i);
} else {
if (self->iterators.size >= MAX_ITERATOR_COUNT) continue;
link = node->links[j];
StackIterator current_iterator = *array_get(&self->iterators, i);
array_push(&self->iterators, current_iterator);
next_iterator = array_back(&self->iterators);
ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees);
}
next_iterator->node = link.node;
if (link.subtree.ptr) {
if (include_subtrees) {
array_push(&next_iterator->subtrees, link.subtree);
ts_subtree_retain(link.subtree);
}
if (!ts_subtree_extra(link.subtree)) {
next_iterator->subtree_count++;
if (!link.is_pending) {
next_iterator->is_pending = false;
}
}
} else {
next_iterator->subtree_count++;
next_iterator->is_pending = false;
}
}
}
}
return self->slices;
}
Stack *ts_stack_new(SubtreePool *subtree_pool) {
Stack *self = ts_calloc(1, sizeof(Stack));
array_init(&self->heads);
array_init(&self->slices);
array_init(&self->iterators);
array_init(&self->node_pool);
array_reserve(&self->heads, 4);
array_reserve(&self->slices, 4);
array_reserve(&self->iterators, 4);
array_reserve(&self->node_pool, MAX_NODE_POOL_SIZE);
self->subtree_pool = subtree_pool;
self->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &self->node_pool);
ts_stack_clear(self);
return self;
}
void ts_stack_delete(Stack *self) {
if (self->slices.contents)
array_delete(&self->slices);
if (self->iterators.contents)
array_delete(&self->iterators);
stack_node_release(self->base_node, &self->node_pool, self->subtree_pool);
for (uint32_t i = 0; i < self->heads.size; i++) {
stack_head_delete(array_get(&self->heads, i), &self->node_pool, self->subtree_pool);
}
array_clear(&self->heads);
if (self->node_pool.contents) {
for (uint32_t i = 0; i < self->node_pool.size; i++)
ts_free(*array_get(&self->node_pool, i));
array_delete(&self->node_pool);
}
array_delete(&self->heads);
ts_free(self);
}
uint32_t ts_stack_version_count(const Stack *self) {
return self->heads.size;
}
uint32_t ts_stack_halted_version_count(Stack *self) {
uint32_t count = 0;
for (uint32_t i = 0; i < self->heads.size; i++) {
StackHead *head = array_get(&self->heads, i);
if (head->status == StackStatusHalted) {
count++;
}
}
return count;
}
TSStateId ts_stack_state(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->node->state;
}
Length ts_stack_position(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->node->position;
}
Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->last_external_token;
}
void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) {
StackHead *head = array_get(&self->heads, version);
if (token.ptr) ts_subtree_retain(token);
if (head->last_external_token.ptr) ts_subtree_release(self->subtree_pool, head->last_external_token);
head->last_external_token = token;
}
unsigned ts_stack_error_cost(const Stack *self, StackVersion version) {
StackHead *head = array_get(&self->heads, version);
unsigned result = head->node->error_cost;
if (
head->status == StackStatusPaused ||
(head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr)) {
result += ERROR_COST_PER_RECOVERY;
}
return result;
}
unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) {
StackHead *head = array_get(&self->heads, version);
if (head->node->node_count < head->node_count_at_last_error) {
head->node_count_at_last_error = head->node->node_count;
}
return head->node->node_count - head->node_count_at_last_error;
}
void ts_stack_push(
Stack *self,
StackVersion version,
Subtree subtree,
bool pending,
TSStateId state
) {
StackHead *head = array_get(&self->heads, version);
StackNode *new_node = stack_node_new(head->node, subtree, pending, state, &self->node_pool);
if (!subtree.ptr) head->node_count_at_last_error = new_node->node_count;
head->node = new_node;
}
forceinline StackAction pop_count_callback(void *payload, const StackIterator *iterator) {
unsigned *goal_subtree_count = payload;
if (iterator->subtree_count == *goal_subtree_count) {
return StackActionPop | StackActionStop;
} else {
return StackActionNone;
}
}
StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) {
return stack__iter(self, version, pop_count_callback, &count, (int)count);
}
forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) {
(void)payload;
if (iterator->subtree_count >= 1) {
if (iterator->is_pending) {
return StackActionPop | StackActionStop;
} else {
return StackActionStop;
}
} else {
return StackActionNone;
}
}
StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) {
StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0);
if (pop.size > 0) {
ts_stack_renumber_version(self, array_get(&pop, 0)->version, version);
array_get(&pop, 0)->version = version;
}
return pop;
}
forceinline StackAction pop_error_callback(void *payload, const StackIterator *iterator) {
if (iterator->subtrees.size > 0) {
bool *found_error = payload;
if (!*found_error && ts_subtree_is_error(*array_get(&iterator->subtrees, 0))) {
*found_error = true;
return StackActionPop | StackActionStop;
} else {
return StackActionStop;
}
} else {
return StackActionNone;
}
}
SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) {
StackNode *node = array_get(&self->heads, version)->node;
for (unsigned i = 0; i < node->link_count; i++) {
if (node->links[i].subtree.ptr && ts_subtree_is_error(node->links[i].subtree)) {
bool found_error = false;
StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1);
if (pop.size > 0) {
ts_assert(pop.size == 1);
ts_stack_renumber_version(self, array_get(&pop, 0)->version, version);
return array_get(&pop, 0)->subtrees;
}
break;
}
}
return (SubtreeArray) {.size = 0};
}
forceinline StackAction pop_all_callback(void *payload, const StackIterator *iterator) {
(void)payload;
return iterator->node->link_count == 0 ? StackActionPop : StackActionNone;
}
StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) {
return stack__iter(self, version, pop_all_callback, NULL, 0);
}
typedef struct {
StackSummary *summary;
unsigned max_depth;
} SummarizeStackSession;
forceinline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) {
SummarizeStackSession *session = payload;
TSStateId state = iterator->node->state;
unsigned depth = iterator->subtree_count;
if (depth > session->max_depth) return StackActionStop;
for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) {
StackSummaryEntry entry = *array_get(session->summary, i);
if (entry.depth < depth) break;
if (entry.depth == depth && entry.state == state) return StackActionNone;
}
array_push(session->summary, ((StackSummaryEntry) {
.position = iterator->node->position,
.depth = depth,
.state = state,
}));
return StackActionNone;
}
void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) {
SummarizeStackSession session = {
.summary = ts_malloc(sizeof(StackSummary)),
.max_depth = max_depth
};
array_init(session.summary);
stack__iter(self, version, summarize_stack_callback, &session, -1);
StackHead *head = array_get(&self->heads, version);
if (head->summary) {
array_delete(head->summary);
ts_free(head->summary);
}
head->summary = session.summary;
}
StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) {
return array_get(&self->heads, version)->summary;
}
int ts_stack_dynamic_precedence(Stack *self, StackVersion version) {
return array_get(&self->heads, version)->node->dynamic_precedence;
}
bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) {
const StackHead *head = array_get(&self->heads, version);
const StackNode *node = head->node;
if (node->error_cost == 0) return true;
while (node) {
if (node->link_count > 0) {
Subtree subtree = node->links[0].subtree;
if (subtree.ptr) {
if (ts_subtree_total_bytes(subtree) > 0) {
return true;
} else if (
node->node_count > head->node_count_at_last_error &&
ts_subtree_error_cost(subtree) == 0
) {
node = node->links[0].node;
continue;
}
}
}
break;
}
return false;
}
void ts_stack_remove_version(Stack *self, StackVersion version) {
stack_head_delete(array_get(&self->heads, version), &self->node_pool, self->subtree_pool);
array_erase(&self->heads, version);
}
void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) {
if (v1 == v2) return;
ts_assert(v2 < v1);
ts_assert((uint32_t)v1 < self->heads.size);
StackHead *source_head = array_get(&self->heads, v1);
StackHead *target_head = array_get(&self->heads, v2);
if (target_head->summary && !source_head->summary) {
source_head->summary = target_head->summary;
target_head->summary = NULL;
}
stack_head_delete(target_head, &self->node_pool, self->subtree_pool);
*target_head = *source_head;
array_erase(&self->heads, v1);
}
void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) {
StackHead temporary_head = *array_get(&self->heads, v1);
*array_get(&self->heads, v1) = *array_get(&self->heads, v2);
*array_get(&self->heads, v2) = temporary_head;
}
StackVersion ts_stack_copy_version(Stack *self, StackVersion version) {
ts_assert(version < self->heads.size);
StackHead version_head = *array_get(&self->heads, version);
array_push(&self->heads, version_head);
StackHead *head = array_back(&self->heads);
stack_node_retain(head->node);
if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token);
head->summary = NULL;
return self->heads.size - 1;
}
bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) {
if (!ts_stack_can_merge(self, version1, version2)) return false;
StackHead *head1 = array_get(&self->heads, version1);
StackHead *head2 = array_get(&self->heads, version2);
for (uint32_t i = 0; i < head2->node->link_count; i++) {
stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool);
}
if (head1->node->state == ERROR_STATE) {
head1->node_count_at_last_error = head1->node->node_count;
}
ts_stack_remove_version(self, version2);
return true;
}
bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) {
StackHead *head1 = array_get(&self->heads, version1);
StackHead *head2 = array_get(&self->heads, version2);
return
head1->status == StackStatusActive &&
head2->status == StackStatusActive &&
head1->node->state == head2->node->state &&
head1->node->position.bytes == head2->node->position.bytes &&
head1->node->error_cost == head2->node->error_cost &&
ts_subtree_external_scanner_state_eq(head1->last_external_token, head2->last_external_token);
}
void ts_stack_halt(Stack *self, StackVersion version) {
array_get(&self->heads, version)->status = StackStatusHalted;
}
void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead) {
StackHead *head = array_get(&self->heads, version);
head->status = StackStatusPaused;
head->lookahead_when_paused = lookahead;
head->node_count_at_last_error = head->node->node_count;
}
bool ts_stack_is_active(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->status == StackStatusActive;
}
bool ts_stack_is_halted(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->status == StackStatusHalted;
}
bool ts_stack_is_paused(const Stack *self, StackVersion version) {
return array_get(&self->heads, version)->status == StackStatusPaused;
}
Subtree ts_stack_resume(Stack *self, StackVersion version) {
StackHead *head = array_get(&self->heads, version);
ts_assert(head->status == StackStatusPaused);
Subtree result = head->lookahead_when_paused;
head->status = StackStatusActive;
head->lookahead_when_paused = NULL_SUBTREE;
return result;
}
void ts_stack_clear(Stack *self) {
stack_node_retain(self->base_node);
for (uint32_t i = 0; i < self->heads.size; i++) {
stack_head_delete(array_get(&self->heads, i), &self->node_pool, self->subtree_pool);
}
array_clear(&self->heads);
array_push(&self->heads, ((StackHead) {
.node = self->base_node,
.status = StackStatusActive,
.last_external_token = NULL_SUBTREE,
.lookahead_when_paused = NULL_SUBTREE,
}));
}
bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) {
array_reserve(&self->iterators, 32);
if (!f) f = stderr;
fprintf(f, "digraph stack {\n");
fprintf(f, "rankdir=\"RL\";\n");
fprintf(f, "edge [arrowhead=none]\n");
Array(StackNode *) visited_nodes = array_new();
array_clear(&self->iterators);
for (uint32_t i = 0; i < self->heads.size; i++) {
StackHead *head = array_get(&self->heads, i);
if (head->status == StackStatusHalted) continue;
fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i);
fprintf(f, "node_head_%u -> node_%p [", i, (void *)head->node);
if (head->status == StackStatusPaused) {
fprintf(f, "color=red ");
}
fprintf(f,
"label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u",
i,
ts_stack_node_count_since_error(self, i),
ts_stack_error_cost(self, i)
);
if (head->summary) {
fprintf(f, "\nsummary:");
for (uint32_t j = 0; j < head->summary->size; j++) fprintf(f, " %u", array_get(head->summary, j)->state);
}
if (head->last_external_token.ptr) {
const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state;
const char *data = ts_external_scanner_state_data(state);
fprintf(f, "\nexternal_scanner_state:");
for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]);
}
fprintf(f, "\"]\n");
array_push(&self->iterators, ((StackIterator) {
.node = head->node
}));
}
bool all_iterators_done = false;
while (!all_iterators_done) {
all_iterators_done = true;
for (uint32_t i = 0; i < self->iterators.size; i++) {
StackIterator iterator = *array_get(&self->iterators, i);
StackNode *node = iterator.node;
for (uint32_t j = 0; j < visited_nodes.size; j++) {
if (*array_get(&visited_nodes, j) == node) {
node = NULL;
break;
}
}
if (!node) continue;
all_iterators_done = false;
fprintf(f, "node_%p [", (void *)node);
if (node->state == ERROR_STATE) {
fprintf(f, "label=\"?\"");
} else if (
node->link_count == 1 &&
node->links[0].subtree.ptr &&
ts_subtree_extra(node->links[0].subtree)
) {
fprintf(f, "shape=point margin=0 label=\"\"");
} else {
fprintf(f, "label=\"%d\"", node->state);
}
fprintf(
f,
" tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n",
node->position.extent.row + 1,
node->position.extent.column,
node->node_count,
node->error_cost,
node->dynamic_precedence
);
for (int j = 0; j < node->link_count; j++) {
StackLink link = node->links[j];
fprintf(f, "node_%p -> node_%p [", (void *)node, (void *)link.node);
if (link.is_pending) fprintf(f, "style=dashed ");
if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray ");
if (!link.subtree.ptr) {
fprintf(f, "color=red");
} else {
fprintf(f, "label=\"");
bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree);
if (quoted) fprintf(f, "'");
ts_language_write_symbol_as_dot_string(language, f, ts_subtree_symbol(link.subtree));
if (quoted) fprintf(f, "'");
fprintf(f, "\"");
fprintf(
f,
"labeltooltip=\"error_cost: %u\ndynamic_precedence: %" PRId32 "\"",
ts_subtree_error_cost(link.subtree),
ts_subtree_dynamic_precedence(link.subtree)
);
}
fprintf(f, "];\n");
StackIterator *next_iterator;
if (j == 0) {
next_iterator = array_get(&self->iterators, i);
} else {
array_push(&self->iterators, iterator);
next_iterator = array_back(&self->iterators);
}
next_iterator->node = link.node;
}
array_push(&visited_nodes, node);
}
}
fprintf(f, "}\n");
array_delete(&visited_nodes);
return true;
}
#undef forceinline

View File

@ -0,0 +1,133 @@
#ifndef TREE_SITTER_PARSE_STACK_H_
#define TREE_SITTER_PARSE_STACK_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./array.h"
#include "./subtree.h"
#include <stdio.h>
typedef struct Stack Stack;
typedef unsigned StackVersion;
#define STACK_VERSION_NONE ((StackVersion)-1)
typedef struct {
SubtreeArray subtrees;
StackVersion version;
} StackSlice;
typedef Array(StackSlice) StackSliceArray;
typedef struct {
Length position;
unsigned depth;
TSStateId state;
} StackSummaryEntry;
typedef Array(StackSummaryEntry) StackSummary;
// Create a stack.
Stack *ts_stack_new(SubtreePool *subtree_pool);
// Release the memory reserved for a given stack.
void ts_stack_delete(Stack *self);
// Get the stack's current number of versions.
uint32_t ts_stack_version_count(const Stack *self);
// Get the stack's current number of halted versions.
uint32_t ts_stack_halted_version_count(Stack *self);
// Get the state at the top of the given version of the stack. If the stack is
// empty, this returns the initial state, 0.
TSStateId ts_stack_state(const Stack *self, StackVersion version);
// Get the last external token associated with a given version of the stack.
Subtree ts_stack_last_external_token(const Stack *self, StackVersion version);
// Set the last external token associated with a given version of the stack.
void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token);
// Get the position of the given version of the stack within the document.
Length ts_stack_position(const Stack *, StackVersion);
// Push a tree and state onto the given version of the stack.
//
// This transfers ownership of the tree to the Stack. Callers that
// need to retain ownership of the tree for their own purposes should
// first retain the tree.
void ts_stack_push(Stack *self, StackVersion version, Subtree subtree, bool pending, TSStateId state);
// Pop the given number of entries from the given version of the stack. This
// operation can increase the number of stack versions by revealing multiple
// versions which had previously been merged. It returns an array that
// specifies the index of each revealed version and the trees that were
// removed from that version.
StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count);
// Remove an error at the top of the given version of the stack.
SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version);
// Remove any pending trees from the top of the given version of the stack.
StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version);
// Remove all trees from the given version of the stack.
StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version);
// Get the maximum number of tree nodes reachable from this version of the stack
// since the last error was detected.
unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version);
int ts_stack_dynamic_precedence(Stack *self, StackVersion version);
bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version);
// Compute a summary of all the parse states near the top of the given
// version of the stack and store the summary for later retrieval.
void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth);
// Retrieve a summary of all the parse states near the top of the
// given version of the stack.
StackSummary *ts_stack_get_summary(Stack *self, StackVersion version);
// Get the total cost of all errors on the given version of the stack.
unsigned ts_stack_error_cost(const Stack *self, StackVersion version);
// Merge the given two stack versions if possible, returning true
// if they were successfully merged and false otherwise.
bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2);
// Determine whether the given two stack versions can be merged.
bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2);
Subtree ts_stack_resume(Stack *self, StackVersion version);
void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead);
void ts_stack_halt(Stack *self, StackVersion version);
bool ts_stack_is_active(const Stack *self, StackVersion version);
bool ts_stack_is_paused(const Stack *self, StackVersion version);
bool ts_stack_is_halted(const Stack *self, StackVersion version);
void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2);
void ts_stack_swap_versions(Stack *, StackVersion v1, StackVersion v2);
StackVersion ts_stack_copy_version(Stack *self, StackVersion version);
// Remove the given version from the stack.
void ts_stack_remove_version(Stack *self, StackVersion version);
void ts_stack_clear(Stack *self);
bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSE_STACK_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,399 @@
#ifndef TREE_SITTER_SUBTREE_H_
#define TREE_SITTER_SUBTREE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include "./length.h"
#include "./array.h"
#include "./error_costs.h"
#include "./host.h"
#include "tree_sitter/api.h"
#include "./parser.h"
#define TS_TREE_STATE_NONE USHRT_MAX
#define NULL_SUBTREE ((Subtree) {.ptr = NULL})
// The serialized state of an external scanner.
//
// Every time an external token subtree is created after a call to an
// external scanner, the scanner's `serialize` function is called to
// retrieve a serialized copy of its state. The bytes are then copied
// onto the subtree itself so that the scanner's state can later be
// restored using its `deserialize` function.
//
// Small byte arrays are stored inline, and long ones are allocated
// separately on the heap.
typedef struct {
union {
char *long_data;
char short_data[24];
};
uint32_t length;
} ExternalScannerState;
// A compact representation of a subtree.
//
// This representation is used for small leaf nodes that are not
// errors, and were not created by an external scanner.
//
// The idea behind the layout of this struct is that the `is_inline`
// bit will fall exactly into the same location as the least significant
// bit of the pointer in `Subtree` or `MutableSubtree`, respectively.
// Because of alignment, for any valid pointer this will be 0, giving
// us the opportunity to make use of this bit to signify whether to use
// the pointer or the inline struct.
typedef struct SubtreeInlineData SubtreeInlineData;
#define SUBTREE_BITS \
bool visible : 1; \
bool named : 1; \
bool extra : 1; \
bool has_changes : 1; \
bool is_missing : 1; \
bool is_keyword : 1;
#define SUBTREE_SIZE \
uint8_t padding_columns; \
uint8_t padding_rows : 4; \
uint8_t lookahead_bytes : 4; \
uint8_t padding_bytes; \
uint8_t size_bytes;
#if TS_BIG_ENDIAN
#if TS_PTR_SIZE == 32
struct SubtreeInlineData {
uint16_t parse_state;
uint8_t symbol;
SUBTREE_BITS
bool unused : 1;
bool is_inline : 1;
SUBTREE_SIZE
};
#else
struct SubtreeInlineData {
SUBTREE_SIZE
uint16_t parse_state;
uint8_t symbol;
SUBTREE_BITS
bool unused : 1;
bool is_inline : 1;
};
#endif
#else
struct SubtreeInlineData {
bool is_inline : 1;
SUBTREE_BITS
uint8_t symbol;
uint16_t parse_state;
SUBTREE_SIZE
};
#endif
#undef SUBTREE_BITS
#undef SUBTREE_SIZE
// A heap-allocated representation of a subtree.
//
// This representation is used for parent nodes, external tokens,
// errors, and other leaf nodes whose data is too large to fit into
// the inline representation.
typedef struct {
volatile uint32_t ref_count;
Length padding;
Length size;
uint32_t lookahead_bytes;
uint32_t error_cost;
uint32_t child_count;
TSSymbol symbol;
TSStateId parse_state;
bool visible : 1;
bool named : 1;
bool extra : 1;
bool fragile_left : 1;
bool fragile_right : 1;
bool has_changes : 1;
bool has_external_tokens : 1;
bool has_external_scanner_state_change : 1;
bool depends_on_column: 1;
bool is_missing : 1;
bool is_keyword : 1;
union {
// Non-terminal subtrees (`child_count > 0`)
struct {
uint32_t visible_child_count;
uint32_t named_child_count;
uint32_t visible_descendant_count;
int32_t dynamic_precedence;
uint16_t repeat_depth;
uint16_t production_id;
struct {
TSSymbol symbol;
TSStateId parse_state;
} first_leaf;
};
// External terminal subtrees (`child_count == 0 && has_external_tokens`)
ExternalScannerState external_scanner_state;
// Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`)
int32_t lookahead_char;
};
} SubtreeHeapData;
// The fundamental building block of a syntax tree.
typedef union {
SubtreeInlineData data;
const SubtreeHeapData *ptr;
} Subtree;
// Like Subtree, but mutable.
typedef union {
SubtreeInlineData data;
SubtreeHeapData *ptr;
} MutableSubtree;
typedef Array(Subtree) SubtreeArray;
typedef Array(MutableSubtree) MutableSubtreeArray;
typedef struct {
MutableSubtreeArray free_trees;
MutableSubtreeArray tree_stack;
} SubtreePool;
void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length);
const char *ts_external_scanner_state_data(const ExternalScannerState *self);
bool ts_external_scanner_state_eq(const ExternalScannerState *self, const char *buffer, unsigned length);
void ts_external_scanner_state_delete(ExternalScannerState *self);
void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest);
void ts_subtree_array_clear(SubtreePool *pool, SubtreeArray *self);
void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self);
void ts_subtree_array_remove_trailing_extras(SubtreeArray *self, SubtreeArray *destination);
void ts_subtree_array_reverse(SubtreeArray *self);
SubtreePool ts_subtree_pool_new(uint32_t capacity);
void ts_subtree_pool_delete(SubtreePool *self);
Subtree ts_subtree_new_leaf(
SubtreePool *pool, TSSymbol symbol, Length padding, Length size,
uint32_t lookahead_bytes, TSStateId parse_state,
bool has_external_tokens, bool depends_on_column,
bool is_keyword, const TSLanguage *language
);
Subtree ts_subtree_new_error(
SubtreePool *pool, int32_t lookahead_char, Length padding, Length size,
uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language
);
MutableSubtree ts_subtree_new_node(
TSSymbol symbol,
SubtreeArray *chiildren,
unsigned production_id,
const TSLanguage *language
);
Subtree ts_subtree_new_error_node(
SubtreeArray *children,
bool extra,
const TSLanguage * language
);
Subtree ts_subtree_new_missing_leaf(
SubtreePool *pool,
TSSymbol symbol,
Length padding,
uint32_t lookahead_bytes,
const TSLanguage *language
);
MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self);
void ts_subtree_retain(Subtree self);
void ts_subtree_release(SubtreePool *pool, Subtree self);
int ts_subtree_compare(Subtree left, Subtree right, SubtreePool *pool);
void ts_subtree_set_symbol(MutableSubtree *self, TSSymbol symbol, const TSLanguage *language);
void ts_subtree_compress(MutableSubtree self, unsigned count, const TSLanguage *language, MutableSubtreeArray *stack);
void ts_subtree_summarize_children(MutableSubtree self, const TSLanguage *language);
Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool);
char *ts_subtree_string(Subtree self, TSSymbol alias_symbol, bool alias_is_named, const TSLanguage *language, bool include_all);
void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f);
Subtree ts_subtree_last_external_token(Subtree tree);
const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self);
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other);
#define SUBTREE_GET(self, name) ((self).data.is_inline ? (self).data.name : (self).ptr->name)
static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); }
static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); }
static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); }
static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); }
static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); }
static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); }
static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); }
static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); }
static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); }
#undef SUBTREE_GET
// Get the size needed to store a heap-allocated subtree with the given
// number of children.
static inline size_t ts_subtree_alloc_size(uint32_t child_count) {
return child_count * sizeof(Subtree) + sizeof(SubtreeHeapData);
}
// Get a subtree's children, which are allocated immediately before the
// tree's own heap data.
#define ts_subtree_children(self) \
((self).data.is_inline ? NULL : (Subtree *)((self).ptr) - (self).ptr->child_count)
static inline void ts_subtree_set_extra(MutableSubtree *self, bool is_extra) {
if (self->data.is_inline) {
self->data.extra = is_extra;
} else {
self->ptr->extra = is_extra;
}
}
static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) {
if (self.data.is_inline) return self.data.symbol;
if (self.ptr->child_count == 0) return self.ptr->symbol;
return self.ptr->first_leaf.symbol;
}
static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) {
if (self.data.is_inline) return self.data.parse_state;
if (self.ptr->child_count == 0) return self.ptr->parse_state;
return self.ptr->first_leaf.parse_state;
}
static inline Length ts_subtree_padding(Subtree self) {
if (self.data.is_inline) {
Length result = {self.data.padding_bytes, {self.data.padding_rows, self.data.padding_columns}};
return result;
} else {
return self.ptr->padding;
}
}
static inline Length ts_subtree_size(Subtree self) {
if (self.data.is_inline) {
Length result = {self.data.size_bytes, {0, self.data.size_bytes}};
return result;
} else {
return self.ptr->size;
}
}
static inline Length ts_subtree_total_size(Subtree self) {
return length_add(ts_subtree_padding(self), ts_subtree_size(self));
}
static inline uint32_t ts_subtree_total_bytes(Subtree self) {
return ts_subtree_total_size(self).bytes;
}
static inline uint32_t ts_subtree_child_count(Subtree self) {
return self.data.is_inline ? 0 : self.ptr->child_count;
}
static inline uint32_t ts_subtree_repeat_depth(Subtree self) {
return self.data.is_inline ? 0 : self.ptr->repeat_depth;
}
static inline uint32_t ts_subtree_is_repetition(Subtree self) {
return self.data.is_inline
? 0
: !self.ptr->named && !self.ptr->visible && self.ptr->child_count != 0;
}
static inline uint32_t ts_subtree_visible_descendant_count(Subtree self) {
return (self.data.is_inline || self.ptr->child_count == 0)
? 0
: self.ptr->visible_descendant_count;
}
static inline uint32_t ts_subtree_visible_child_count(Subtree self) {
if (ts_subtree_child_count(self) > 0) {
return self.ptr->visible_child_count;
} else {
return 0;
}
}
static inline uint32_t ts_subtree_error_cost(Subtree self) {
if (ts_subtree_missing(self)) {
return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY;
} else {
return self.data.is_inline ? 0 : self.ptr->error_cost;
}
}
static inline int32_t ts_subtree_dynamic_precedence(Subtree self) {
return (self.data.is_inline || self.ptr->child_count == 0) ? 0 : self.ptr->dynamic_precedence;
}
static inline uint16_t ts_subtree_production_id(Subtree self) {
if (ts_subtree_child_count(self) > 0) {
return self.ptr->production_id;
} else {
return 0;
}
}
static inline bool ts_subtree_fragile_left(Subtree self) {
return self.data.is_inline ? false : self.ptr->fragile_left;
}
static inline bool ts_subtree_fragile_right(Subtree self) {
return self.data.is_inline ? false : self.ptr->fragile_right;
}
static inline bool ts_subtree_has_external_tokens(Subtree self) {
return self.data.is_inline ? false : self.ptr->has_external_tokens;
}
static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) {
return self.data.is_inline ? false : self.ptr->has_external_scanner_state_change;
}
static inline bool ts_subtree_depends_on_column(Subtree self) {
return self.data.is_inline ? false : self.ptr->depends_on_column;
}
static inline bool ts_subtree_is_fragile(Subtree self) {
return self.data.is_inline ? false : (self.ptr->fragile_left || self.ptr->fragile_right);
}
static inline bool ts_subtree_is_error(Subtree self) {
return ts_subtree_symbol(self) == ts_builtin_sym_error;
}
static inline bool ts_subtree_is_eof(Subtree self) {
return ts_subtree_symbol(self) == ts_builtin_sym_end;
}
static inline Subtree ts_subtree_from_mut(MutableSubtree self) {
Subtree result;
result.data = self.data;
return result;
}
static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) {
MutableSubtree result;
result.data = self.data;
return result;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_SUBTREE_H_

View File

@ -0,0 +1,170 @@
#include "tree_sitter/api.h"
#include "./array.h"
#include "./get_changed_ranges.h"
#include "./length.h"
#include "./subtree.h"
#include "./tree_cursor.h"
#include "./tree.h"
TSTree *ts_tree_new(
Subtree root, const TSLanguage *language,
const TSRange *included_ranges, unsigned included_range_count
) {
TSTree *result = ts_malloc(sizeof(TSTree));
result->root = root;
result->language = ts_language_copy(language);
result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange));
memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange));
result->included_range_count = included_range_count;
return result;
}
TSTree *ts_tree_copy(const TSTree *self) {
ts_subtree_retain(self->root);
return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count);
}
void ts_tree_delete(TSTree *self) {
if (!self) return;
SubtreePool pool = ts_subtree_pool_new(0);
ts_subtree_release(&pool, self->root);
ts_subtree_pool_delete(&pool);
ts_language_delete(self->language);
ts_free(self->included_ranges);
ts_free(self);
}
TSNode ts_tree_root_node(const TSTree *self) {
return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0);
}
TSNode ts_tree_root_node_with_offset(
const TSTree *self,
uint32_t offset_bytes,
TSPoint offset_extent
) {
Length offset = {offset_bytes, offset_extent};
return ts_node_new(self, &self->root, length_add(offset, ts_subtree_padding(self->root)), 0);
}
const TSLanguage *ts_tree_language(const TSTree *self) {
return self->language;
}
void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *range = &self->included_ranges[i];
if (range->end_byte >= edit->old_end_byte) {
if (range->end_byte != UINT32_MAX) {
range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte);
range->end_point = point_add(
edit->new_end_point,
point_sub(range->end_point, edit->old_end_point)
);
if (range->end_byte < edit->new_end_byte) {
range->end_byte = UINT32_MAX;
range->end_point = POINT_MAX;
}
}
} else if (range->end_byte > edit->start_byte) {
range->end_byte = edit->start_byte;
range->end_point = edit->start_point;
}
if (range->start_byte >= edit->old_end_byte) {
range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte);
range->start_point = point_add(
edit->new_end_point,
point_sub(range->start_point, edit->old_end_point)
);
if (range->start_byte < edit->new_end_byte) {
range->start_byte = UINT32_MAX;
range->start_point = POINT_MAX;
}
} else if (range->start_byte > edit->start_byte) {
range->start_byte = edit->start_byte;
range->start_point = edit->start_point;
}
}
SubtreePool pool = ts_subtree_pool_new(0);
self->root = ts_subtree_edit(self->root, edit, &pool);
ts_subtree_pool_delete(&pool);
}
TSRange *ts_tree_included_ranges(const TSTree *self, uint32_t *length) {
*length = self->included_range_count;
TSRange *ranges = ts_calloc(self->included_range_count, sizeof(TSRange));
memcpy(ranges, self->included_ranges, self->included_range_count * sizeof(TSRange));
return ranges;
}
TSRange *ts_tree_get_changed_ranges(const TSTree *old_tree, const TSTree *new_tree, uint32_t *length) {
TreeCursor cursor1 = {NULL, array_new(), 0};
TreeCursor cursor2 = {NULL, array_new(), 0};
ts_tree_cursor_init(&cursor1, ts_tree_root_node(old_tree));
ts_tree_cursor_init(&cursor2, ts_tree_root_node(new_tree));
TSRangeArray included_range_differences = array_new();
ts_range_array_get_changed_ranges(
old_tree->included_ranges, old_tree->included_range_count,
new_tree->included_ranges, new_tree->included_range_count,
&included_range_differences
);
TSRange *result;
*length = ts_subtree_get_changed_ranges(
&old_tree->root, &new_tree->root, &cursor1, &cursor2,
old_tree->language, &included_range_differences, &result
);
array_delete(&included_range_differences);
array_delete(&cursor1.stack);
array_delete(&cursor2.stack);
return result;
}
#ifdef _WIN32
#include <io.h>
#include <windows.h>
int _ts_dup(HANDLE handle) {
HANDLE dup_handle;
if (!DuplicateHandle(
GetCurrentProcess(), handle,
GetCurrentProcess(), &dup_handle,
0, FALSE, DUPLICATE_SAME_ACCESS
)) return -1;
return _open_osfhandle((intptr_t)dup_handle, 0);
}
void ts_tree_print_dot_graph(const TSTree *self, int fd) {
FILE *file = _fdopen(_ts_dup((HANDLE)_get_osfhandle(fd)), "a");
ts_subtree_print_dot_graph(self->root, self->language, file);
fclose(file);
}
#elif !defined(__wasi__) // WASI doesn't support dup
#include <unistd.h>
int _ts_dup(int file_descriptor) {
return dup(file_descriptor);
}
void ts_tree_print_dot_graph(const TSTree *self, int file_descriptor) {
FILE *file = fdopen(_ts_dup(file_descriptor), "a");
ts_subtree_print_dot_graph(self->root, self->language, file);
fclose(file);
}
#else
void ts_tree_print_dot_graph(const TSTree *self, int file_descriptor) {
(void)self;
(void)file_descriptor;
}
#endif

View File

@ -0,0 +1,31 @@
#ifndef TREE_SITTER_TREE_H_
#define TREE_SITTER_TREE_H_
#include "./subtree.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
const Subtree *child;
const Subtree *parent;
Length position;
TSSymbol alias_symbol;
} ParentCacheEntry;
struct TSTree {
Subtree root;
const TSLanguage *language;
TSRange *included_ranges;
unsigned included_range_count;
};
TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *included_ranges, unsigned included_range_count);
TSNode ts_node_new(const TSTree *tree, const Subtree *subtree, Length position, TSSymbol alias);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_TREE_H_

View File

@ -0,0 +1,716 @@
#include "tree_sitter/api.h"
#include "./tree_cursor.h"
#include "./language.h"
#include "./tree.h"
typedef struct {
Subtree parent;
const TSTree *tree;
Length position;
uint32_t child_index;
uint32_t structural_child_index;
uint32_t descendant_index;
const TSSymbol *alias_sequence;
} CursorChildIterator;
// CursorChildIterator
static inline bool ts_tree_cursor_is_entry_visible(const TreeCursor *self, uint32_t index) {
TreeCursorEntry *entry = array_get(&self->stack, index);
if (index == 0 || ts_subtree_visible(*entry->subtree)) {
return true;
} else if (!ts_subtree_extra(*entry->subtree)) {
TreeCursorEntry *parent_entry = array_get(&self->stack, index - 1);
return ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
entry->structural_child_index
);
} else {
return false;
}
}
static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) {
TreeCursorEntry *last_entry = array_back(&self->stack);
if (ts_subtree_child_count(*last_entry->subtree) == 0) {
return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, 0, NULL};
}
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
last_entry->subtree->ptr->production_id
);
uint32_t descendant_index = last_entry->descendant_index;
if (ts_tree_cursor_is_entry_visible(self, self->stack.size - 1)) {
descendant_index += 1;
}
return (CursorChildIterator) {
.tree = self->tree,
.parent = *last_entry->subtree,
.position = last_entry->position,
.child_index = 0,
.structural_child_index = 0,
.descendant_index = descendant_index,
.alias_sequence = alias_sequence,
};
}
static inline bool ts_tree_cursor_child_iterator_next(
CursorChildIterator *self,
TreeCursorEntry *result,
bool *visible
) {
if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false;
const Subtree *child = &ts_subtree_children(self->parent)[self->child_index];
*result = (TreeCursorEntry) {
.subtree = child,
.position = self->position,
.child_index = self->child_index,
.structural_child_index = self->structural_child_index,
.descendant_index = self->descendant_index,
};
*visible = ts_subtree_visible(*child);
bool extra = ts_subtree_extra(*child);
if (!extra) {
if (self->alias_sequence) {
*visible |= self->alias_sequence[self->structural_child_index];
}
self->structural_child_index++;
}
self->descendant_index += ts_subtree_visible_descendant_count(*child);
if (*visible) {
self->descendant_index += 1;
}
self->position = length_add(self->position, ts_subtree_size(*child));
self->child_index++;
if (self->child_index < self->parent.ptr->child_count) {
Subtree next_child = ts_subtree_children(self->parent)[self->child_index];
self->position = length_add(self->position, ts_subtree_padding(next_child));
}
return true;
}
// Return a position that, when `b` is added to it, yields `a`. This
// can only be computed if `b` has zero rows. Otherwise, this function
// returns `LENGTH_UNDEFINED`, and the caller needs to recompute
// the position some other way.
static inline Length length_backtrack(Length a, Length b) {
if (length_is_undefined(a) || b.extent.row != 0) {
return LENGTH_UNDEFINED;
}
Length result;
result.bytes = a.bytes - b.bytes;
result.extent.row = a.extent.row;
result.extent.column = a.extent.column - b.extent.column;
return result;
}
static inline bool ts_tree_cursor_child_iterator_previous(
CursorChildIterator *self,
TreeCursorEntry *result,
bool *visible
) {
// this is mostly a reverse `ts_tree_cursor_child_iterator_next` taking into
// account unsigned underflow
if (!self->parent.ptr || (int8_t)self->child_index == -1) return false;
const Subtree *child = &ts_subtree_children(self->parent)[self->child_index];
*result = (TreeCursorEntry) {
.subtree = child,
.position = self->position,
.child_index = self->child_index,
.structural_child_index = self->structural_child_index,
};
*visible = ts_subtree_visible(*child);
bool extra = ts_subtree_extra(*child);
self->position = length_backtrack(self->position, ts_subtree_padding(*child));
self->child_index--;
if (!extra && self->alias_sequence) {
*visible |= self->alias_sequence[self->structural_child_index];
if (self->structural_child_index > 0) {
self->structural_child_index--;
}
}
// unsigned can underflow so compare it to child_count
if (self->child_index < self->parent.ptr->child_count) {
Subtree previous_child = ts_subtree_children(self->parent)[self->child_index];
Length size = ts_subtree_size(previous_child);
self->position = length_backtrack(self->position, size);
}
return true;
}
// TSTreeCursor - lifecycle
TSTreeCursor ts_tree_cursor_new(TSNode node) {
TSTreeCursor self = {NULL, NULL, {0, 0, 0}};
ts_tree_cursor_init((TreeCursor *)&self, node);
return self;
}
void ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) {
ts_tree_cursor_init((TreeCursor *)_self, node);
}
void ts_tree_cursor_init(TreeCursor *self, TSNode node) {
self->tree = node.tree;
self->root_alias_symbol = node.context[3];
array_clear(&self->stack);
array_push(&self->stack, ((TreeCursorEntry) {
.subtree = (const Subtree *)node.id,
.position = {
ts_node_start_byte(node),
ts_node_start_point(node)
},
.child_index = 0,
.structural_child_index = 0,
.descendant_index = 0,
}));
}
void ts_tree_cursor_delete(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
array_delete(&self->stack);
}
// TSTreeCursor - walking the tree
TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
bool visible;
TreeCursorEntry entry;
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
if (visible) {
array_push(&self->stack, entry);
return TreeCursorStepVisible;
}
if (ts_subtree_visible_child_count(*entry.subtree) > 0) {
array_push(&self->stack, entry);
return TreeCursorStepHidden;
}
}
return TreeCursorStepNone;
}
bool ts_tree_cursor_goto_first_child(TSTreeCursor *self) {
for (;;) {
switch (ts_tree_cursor_goto_first_child_internal(self)) {
case TreeCursorStepHidden:
continue;
case TreeCursorStepVisible:
return true;
default:
return false;
}
}
}
TreeCursorStep ts_tree_cursor_goto_last_child_internal(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
bool visible;
TreeCursorEntry entry;
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
if (!iterator.parent.ptr || iterator.parent.ptr->child_count == 0) return TreeCursorStepNone;
TreeCursorEntry last_entry = {0};
TreeCursorStep last_step = TreeCursorStepNone;
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
if (visible) {
last_entry = entry;
last_step = TreeCursorStepVisible;
}
else if (ts_subtree_visible_child_count(*entry.subtree) > 0) {
last_entry = entry;
last_step = TreeCursorStepHidden;
}
}
if (last_entry.subtree) {
array_push(&self->stack, last_entry);
return last_step;
}
return TreeCursorStepNone;
}
bool ts_tree_cursor_goto_last_child(TSTreeCursor *self) {
for (;;) {
switch (ts_tree_cursor_goto_last_child_internal(self)) {
case TreeCursorStepHidden:
continue;
case TreeCursorStepVisible:
return true;
default:
return false;
}
}
}
static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point(
TSTreeCursor *_self,
uint32_t goal_byte,
TSPoint goal_point
) {
TreeCursor *self = (TreeCursor *)_self;
uint32_t initial_size = self->stack.size;
uint32_t visible_child_index = 0;
bool did_descend;
do {
did_descend = false;
bool visible;
TreeCursorEntry entry;
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
Length entry_end = length_add(entry.position, ts_subtree_size(*entry.subtree));
bool at_goal = entry_end.bytes > goal_byte && point_gt(entry_end.extent, goal_point);
uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree);
if (at_goal) {
if (visible) {
array_push(&self->stack, entry);
return visible_child_index;
}
if (visible_child_count > 0) {
array_push(&self->stack, entry);
did_descend = true;
break;
}
} else if (visible) {
visible_child_index++;
} else {
visible_child_index += visible_child_count;
}
}
} while (did_descend);
self->stack.size = initial_size;
return -1;
}
int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *self, uint32_t goal_byte) {
return ts_tree_cursor_goto_first_child_for_byte_and_point(self, goal_byte, POINT_ZERO);
}
int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint goal_point) {
return ts_tree_cursor_goto_first_child_for_byte_and_point(self, 0, goal_point);
}
TreeCursorStep ts_tree_cursor_goto_sibling_internal(
TSTreeCursor *_self,
bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)
) {
TreeCursor *self = (TreeCursor *)_self;
uint32_t initial_size = self->stack.size;
while (self->stack.size > 1) {
TreeCursorEntry entry = array_pop(&self->stack);
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
iterator.child_index = entry.child_index;
iterator.structural_child_index = entry.structural_child_index;
iterator.position = entry.position;
iterator.descendant_index = entry.descendant_index;
bool visible = false;
advance(&iterator, &entry, &visible);
if (visible && self->stack.size + 1 < initial_size) break;
while (advance(&iterator, &entry, &visible)) {
if (visible) {
array_push(&self->stack, entry);
return TreeCursorStepVisible;
}
if (ts_subtree_visible_child_count(*entry.subtree)) {
array_push(&self->stack, entry);
return TreeCursorStepHidden;
}
}
}
self->stack.size = initial_size;
return TreeCursorStepNone;
}
TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self) {
return ts_tree_cursor_goto_sibling_internal(_self, ts_tree_cursor_child_iterator_next);
}
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *self) {
switch (ts_tree_cursor_goto_next_sibling_internal(self)) {
case TreeCursorStepHidden:
ts_tree_cursor_goto_first_child(self);
return true;
case TreeCursorStepVisible:
return true;
default:
return false;
}
}
TreeCursorStep ts_tree_cursor_goto_previous_sibling_internal(TSTreeCursor *_self) {
// since subtracting across row loses column information, we may have to
// restore it
TreeCursor *self = (TreeCursor *)_self;
// for that, save current position before traversing
TreeCursorStep step = ts_tree_cursor_goto_sibling_internal(
_self, ts_tree_cursor_child_iterator_previous);
if (step == TreeCursorStepNone)
return step;
// if length is already valid, there's no need to recompute it
if (!length_is_undefined(array_back(&self->stack)->position))
return step;
// restore position from the parent node
const TreeCursorEntry *parent = array_get(&self->stack, self->stack.size - 2);
Length position = parent->position;
uint32_t child_index = array_back(&self->stack)->child_index;
const Subtree *children = ts_subtree_children((*(parent->subtree)));
if (child_index > 0) {
// skip first child padding since its position should match the position of the parent
position = length_add(position, ts_subtree_size(children[0]));
for (uint32_t i = 1; i < child_index; ++i) {
position = length_add(position, ts_subtree_total_size(children[i]));
}
position = length_add(position, ts_subtree_padding(children[child_index]));
}
array_back(&self->stack)->position = position;
return step;
}
bool ts_tree_cursor_goto_previous_sibling(TSTreeCursor *self) {
switch (ts_tree_cursor_goto_previous_sibling_internal(self)) {
case TreeCursorStepHidden:
ts_tree_cursor_goto_last_child(self);
return true;
case TreeCursorStepVisible:
return true;
default:
return false;
}
}
bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) {
if (ts_tree_cursor_is_entry_visible(self, i)) {
self->stack.size = i + 1;
return true;
}
}
return false;
}
void ts_tree_cursor_goto_descendant(
TSTreeCursor *_self,
uint32_t goal_descendant_index
) {
TreeCursor *self = (TreeCursor *)_self;
// Ascend to the lowest ancestor that contains the goal node.
for (;;) {
uint32_t i = self->stack.size - 1;
TreeCursorEntry *entry = array_get(&self->stack, i);
uint32_t next_descendant_index =
entry->descendant_index +
(ts_tree_cursor_is_entry_visible(self, i) ? 1 : 0) +
ts_subtree_visible_descendant_count(*entry->subtree);
if (
(entry->descendant_index <= goal_descendant_index) &&
(next_descendant_index > goal_descendant_index)
) {
break;
} else if (self->stack.size <= 1) {
return;
} else {
self->stack.size--;
}
}
// Descend to the goal node.
bool did_descend = true;
do {
did_descend = false;
bool visible;
TreeCursorEntry entry;
CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
if (iterator.descendant_index > goal_descendant_index) {
return;
}
while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
if (iterator.descendant_index > goal_descendant_index) {
array_push(&self->stack, entry);
if (visible && entry.descendant_index == goal_descendant_index) {
return;
} else {
did_descend = true;
break;
}
}
}
} while (did_descend);
}
uint32_t ts_tree_cursor_current_descendant_index(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
TreeCursorEntry *last_entry = array_back(&self->stack);
return last_entry->descendant_index;
}
TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
TreeCursorEntry *last_entry = array_back(&self->stack);
bool is_extra = ts_subtree_extra(*last_entry->subtree);
TSSymbol alias_symbol = is_extra ? 0 : self->root_alias_symbol;
if (self->stack.size > 1 && !is_extra) {
TreeCursorEntry *parent_entry = array_get(&self->stack, self->stack.size - 2);
alias_symbol = ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
last_entry->structural_child_index
);
}
return ts_node_new(
self->tree,
last_entry->subtree,
last_entry->position,
alias_symbol
);
}
// Private - Get various facts about the current node that are needed
// when executing tree queries.
void ts_tree_cursor_current_status(
const TSTreeCursor *_self,
TSFieldId *field_id,
bool *has_later_siblings,
bool *has_later_named_siblings,
bool *can_have_later_siblings_with_this_field,
TSSymbol *supertypes,
unsigned *supertype_count
) {
const TreeCursor *self = (const TreeCursor *)_self;
unsigned max_supertypes = *supertype_count;
*field_id = 0;
*supertype_count = 0;
*has_later_siblings = false;
*has_later_named_siblings = false;
*can_have_later_siblings_with_this_field = false;
// Walk up the tree, visiting the current node and its invisible ancestors,
// because fields can refer to nodes through invisible *wrapper* nodes,
for (unsigned i = self->stack.size - 1; i > 0; i--) {
TreeCursorEntry *entry = array_get(&self->stack, i);
TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1);
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
#define subtree_symbol(subtree, structural_child_index) \
(( \
!ts_subtree_extra(subtree) && \
alias_sequence && \
alias_sequence[structural_child_index] \
) ? \
alias_sequence[structural_child_index] : \
ts_subtree_symbol(subtree))
// Stop walking up when a visible ancestor is found.
TSSymbol entry_symbol = subtree_symbol(
*entry->subtree,
entry->structural_child_index
);
TSSymbolMetadata entry_metadata = ts_language_symbol_metadata(
self->tree->language,
entry_symbol
);
if (i != self->stack.size - 1 && entry_metadata.visible) break;
// Record any supertypes
if (entry_metadata.supertype && *supertype_count < max_supertypes) {
supertypes[*supertype_count] = entry_symbol;
(*supertype_count)++;
}
// Determine if the current node has later siblings.
if (!*has_later_siblings) {
unsigned sibling_count = parent_entry->subtree->ptr->child_count;
unsigned structural_child_index = entry->structural_child_index;
if (!ts_subtree_extra(*entry->subtree)) structural_child_index++;
for (unsigned j = entry->child_index + 1; j < sibling_count; j++) {
Subtree sibling = ts_subtree_children(*parent_entry->subtree)[j];
TSSymbolMetadata sibling_metadata = ts_language_symbol_metadata(
self->tree->language,
subtree_symbol(sibling, structural_child_index)
);
if (sibling_metadata.visible) {
*has_later_siblings = true;
if (*has_later_named_siblings) break;
if (sibling_metadata.named) {
*has_later_named_siblings = true;
break;
}
} else if (ts_subtree_visible_child_count(sibling) > 0) {
*has_later_siblings = true;
if (*has_later_named_siblings) break;
if (sibling.ptr->named_child_count > 0) {
*has_later_named_siblings = true;
break;
}
}
if (!ts_subtree_extra(sibling)) structural_child_index++;
}
}
#undef subtree_symbol
if (!ts_subtree_extra(*entry->subtree)) {
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
// Look for a field name associated with the current node.
if (!*field_id) {
for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) {
if (!map->inherited && map->child_index == entry->structural_child_index) {
*field_id = map->field_id;
break;
}
}
}
// Determine if the current node can have later siblings with the same field name.
if (*field_id) {
for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) {
if (
map->field_id == *field_id &&
map->child_index > entry->structural_child_index
) {
*can_have_later_siblings_with_this_field = true;
break;
}
}
}
}
}
}
uint32_t ts_tree_cursor_current_depth(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
uint32_t depth = 0;
for (unsigned i = 1; i < self->stack.size; i++) {
if (ts_tree_cursor_is_entry_visible(self, i)) {
depth++;
}
}
return depth;
}
TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
for (int i = (int)self->stack.size - 2; i >= 0; i--) {
TreeCursorEntry *entry = array_get(&self->stack, i);
bool is_visible = true;
TSSymbol alias_symbol = 0;
if (i > 0) {
TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1);
alias_symbol = ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
entry->structural_child_index
);
is_visible = (alias_symbol != 0) || ts_subtree_visible(*entry->subtree);
}
if (is_visible) {
return ts_node_new(
self->tree,
entry->subtree,
entry->position,
alias_symbol
);
}
}
return ts_node_new(NULL, NULL, length_zero(), 0);
}
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
// Walk up the tree, visiting the current node and its invisible ancestors.
for (unsigned i = self->stack.size - 1; i > 0; i--) {
TreeCursorEntry *entry = array_get(&self->stack, i);
TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1);
// Stop walking up when another visible node is found.
if (
i != self->stack.size - 1 &&
ts_tree_cursor_is_entry_visible(self, i)
) break;
if (ts_subtree_extra(*entry->subtree)) break;
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) {
if (!map->inherited && map->child_index == entry->structural_child_index) {
return map->field_id;
}
}
}
return 0;
}
const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) {
TSFieldId id = ts_tree_cursor_current_field_id(_self);
if (id) {
const TreeCursor *self = (const TreeCursor *)_self;
return self->tree->language->field_names[id];
} else {
return NULL;
}
}
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) {
const TreeCursor *cursor = (const TreeCursor *)_cursor;
TSTreeCursor res = {NULL, NULL, {0, 0}};
TreeCursor *copy = (TreeCursor *)&res;
copy->tree = cursor->tree;
copy->root_alias_symbol = cursor->root_alias_symbol;
array_init(&copy->stack);
array_push_all(&copy->stack, &cursor->stack);
return res;
}
void ts_tree_cursor_reset_to(TSTreeCursor *_dst, const TSTreeCursor *_src) {
const TreeCursor *cursor = (const TreeCursor *)_src;
TreeCursor *copy = (TreeCursor *)_dst;
copy->tree = cursor->tree;
copy->root_alias_symbol = cursor->root_alias_symbol;
array_clear(&copy->stack);
array_push_all(&copy->stack, &cursor->stack);
}

View File

@ -0,0 +1,48 @@
#ifndef TREE_SITTER_TREE_CURSOR_H_
#define TREE_SITTER_TREE_CURSOR_H_
#include "./subtree.h"
typedef struct {
const Subtree *subtree;
Length position;
uint32_t child_index;
uint32_t structural_child_index;
uint32_t descendant_index;
} TreeCursorEntry;
typedef struct {
const TSTree *tree;
Array(TreeCursorEntry) stack;
TSSymbol root_alias_symbol;
} TreeCursor;
typedef enum {
TreeCursorStepNone,
TreeCursorStepHidden,
TreeCursorStepVisible,
} TreeCursorStep;
void ts_tree_cursor_init(TreeCursor *self, TSNode node);
void ts_tree_cursor_current_status(
const TSTreeCursor *_self,
TSFieldId *field_id,
bool *has_later_siblings,
bool *has_later_named_siblings,
bool *can_have_later_siblings_with_this_field,
TSSymbol *supertypes,
unsigned *supertype_count
);
TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self);
TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self);
static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
TreeCursorEntry *last_entry = array_back(&self->stack);
return *last_entry->subtree;
}
TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self);
#endif // TREE_SITTER_TREE_CURSOR_H_

View File

@ -0,0 +1,11 @@
#ifndef TREE_SITTER_ASSERT_H_
#define TREE_SITTER_ASSERT_H_
#ifdef NDEBUG
#define ts_assert(e) ((void)(e))
#else
#include <assert.h>
#define ts_assert(e) assert(e)
#endif
#endif // TREE_SITTER_ASSERT_H_

View File

@ -0,0 +1,75 @@
#ifndef TREE_SITTER_UNICODE_H_
#define TREE_SITTER_UNICODE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdint.h>
#define U_EXPORT
#define U_EXPORT2
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "portable/endian.h"
#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=le16toh((s)[(i)++]); \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
} UPRV_BLOCK_MACRO_END
#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=be16toh((s)[(i)++]); \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
} UPRV_BLOCK_MACRO_END
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
static inline uint32_t ts_decode_utf8(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U8_NEXT(string, i, length, *code_point);
return i;
}
static inline uint32_t ts_decode_utf16_le(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U16_NEXT_LE(((uint16_t *)string), i, length, *code_point);
return i * 2;
}
static inline uint32_t ts_decode_utf16_be(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U16_NEXT_BE(((uint16_t *)string), i, length, *code_point);
return i * 2;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_UNICODE_H_

View File

@ -0,0 +1 @@
552b01f61127d30d6589aa4bf99468224979b661

View File

@ -0,0 +1,414 @@
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
---------------------
Third-Party Software Licenses
This section contains third-party software notices and/or additional
terms for licensed third-party software components included within ICU
libraries.
1. ICU License - ICU 1.8.1 to ICU 57.1
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2016 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies of
the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale, use
or other dealings in this Software without prior written authorization
of the copyright holder.
All trademarks and registered trademarks mentioned herein are the
property of their respective owners.
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
# The Google Chrome software developed by Google is licensed under
# the BSD license. Other software included in this distribution is
# provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution.
# Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists
# listed below with further processing for compound word breaking. The
# frequency is generated with an iterative training against Google web
# corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia
# * Sinica. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
# University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END--------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN-------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END----------------------------------
3. Lao Word Break Dictionary Data (laodict.txt)
# Copyright (c) 2013 International Business Machines Corporation
# and others. All Rights Reserved.
#
# Project: http://code.google.com/p/lao-dictionary/
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
# (copied below)
#
# This file is derived from the above dictionary, with slight
# modifications.
# ----------------------------------------------------------------------
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification,
# are permitted provided that the following conditions are met:
#
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in
# binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------
4. Burmese Word Break Dictionary Data (burmesedict.txt)
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer. Redistributions in binary form must reproduce the
# above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# --------------------------------------------------------------------------
5. Time Zone Database
ICU uses the public domain data and code derived from Time Zone
Database for its time zone support. The ownership of the TZ database
is explained in BCP 175: Procedure for Maintaining the Time Zone
Database section 7.
# 7. Database Ownership
#
# The TZ database itself is not an IETF Contribution or an IETF
# document. Rather it is a pre-existing and regularly updated work
# that is in the public domain, and is intended to remain in the
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
# not apply to the TZ Database or contributions that individuals make
# to it. Should any claims be made and substantiated against the TZ
# Database, the organization that is providing the IANA
# Considerations defined in this RFC, under the memorandum of
# understanding with the IETF, currently ICANN, may act in accordance
# with all competent court orders. No ownership claims will be made
# by ICANN or the IETF Trust on the database or the code. Any person
# making a contribution to the database or code waives all rights to
# future claims in that contribution or in the TZ Database.
6. Google double-conversion
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,29 @@
# ICU Parts
This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
### License
The license for these files is contained in the `LICENSE` file within this directory.
### Contents
* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
* `utf8.h`
* `utf16.h`
* `umachine.h`
* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
* `ptypes.h`
* `urename.h`
* `utf.h`
* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
* `README.md` - This text file.
### Updating ICU
To incorporate changes from the upstream `icu` repository:
* Update `ICU_SHA` with the new Git SHA.
* Update `LICENSE` with the license text from the directory mentioned above.
* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,448 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: umachine.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file defines basic types and constants for ICU to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*/
#ifndef __UMACHINE_H__
#define __UMACHINE_H__
/**
* \file
* \brief Basic types and constants for UTF
*
* <h2> Basic types and constants for UTF </h2>
* This file defines basic types and constants for utf.h to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*
*/
/*==========================================================================*/
/* Include platform-dependent definitions */
/* which are contained in the platform-specific file platform.h */
/*==========================================================================*/
#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */
/*
* ANSI C headers:
* stddef.h defines wchar_t
*/
#include <stddef.h>
/*==========================================================================*/
/* For C wrappers, we use the symbol U_STABLE. */
/* This works properly if the includer is C or C++. */
/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
/*==========================================================================*/
/**
* \def U_CFUNC
* This is used in a declaration of a library private ICU C function.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_BEGIN
* This is used to begin a declaration of a library private ICU C API.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_END
* This is used to end a declaration of a library private ICU C API
* @stable ICU 2.4
*/
#ifdef __cplusplus
# define U_CFUNC extern "C"
# define U_CDECL_BEGIN extern "C" {
# define U_CDECL_END }
#else
# define U_CFUNC extern
# define U_CDECL_BEGIN
# define U_CDECL_END
#endif
#ifndef U_ATTRIBUTE_DEPRECATED
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for GCC specific attributes
* @internal
*/
#if U_GCC_MAJOR_MINOR >= 302
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for Visual C++ specific attributes
* @internal
*/
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
#else
# define U_ATTRIBUTE_DEPRECATED
#endif
#endif
/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
#define U_CAPI U_CFUNC U_EXPORT
/** This is used to declare a function as a stable public ICU C API*/
#define U_STABLE U_CAPI
/** This is used to declare a function as a draft public ICU C API */
#define U_DRAFT U_CAPI
/** This is used to declare a function as a deprecated public ICU C API */
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
/** This is used to declare a function as an obsolete public ICU C API */
#define U_OBSOLETE U_CAPI
/** This is used to declare a function as an internal ICU C API */
#define U_INTERNAL U_CAPI
/**
* \def U_OVERRIDE
* Defined to the C++11 "override" keyword if available.
* Denotes a class or member which is an override of the base class.
* May result in an error if it applied to something not an override.
* @internal
*/
#ifndef U_OVERRIDE
#define U_OVERRIDE override
#endif
/**
* \def U_FINAL
* Defined to the C++11 "final" keyword if available.
* Denotes a class or member which may not be overridden in subclasses.
* May result in an error if subclasses attempt to override.
* @internal
*/
#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
#define U_FINAL final
#endif
// Before ICU 65, function-like, multi-statement ICU macros were just defined as
// series of statements wrapped in { } blocks and the caller could choose to
// either treat them as if they were actual functions and end the invocation
// with a trailing ; creating an empty statement after the block or else omit
// this trailing ; using the knowledge that the macro would expand to { }.
//
// But doing so doesn't work well with macros that look like functions and
// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
// switches to the standard solution of wrapping such macros in do { } while.
//
// This will however break existing code that depends on being able to invoke
// these macros without a trailing ; so to be able to remain compatible with
// such code the wrapper is itself defined as macros so that it's possible to
// build ICU 65 and later with the old macro behaviour, like this:
//
// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
// runConfigureICU ...
/**
* \def UPRV_BLOCK_MACRO_BEGIN
* Defined as the "do" keyword by default.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_BEGIN
#define UPRV_BLOCK_MACRO_BEGIN do
#endif
/**
* \def UPRV_BLOCK_MACRO_END
* Defined as "while (FALSE)" by default.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_END
#define UPRV_BLOCK_MACRO_END while (FALSE)
#endif
/*==========================================================================*/
/* limits for int32_t etc., like in POSIX inttypes.h */
/*==========================================================================*/
#ifndef INT8_MIN
/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MIN ((int8_t)(-128))
#endif
#ifndef INT16_MIN
/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MIN ((int16_t)(-32767-1))
#endif
#ifndef INT32_MIN
/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MIN ((int32_t)(-2147483647-1))
#endif
#ifndef INT8_MAX
/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MAX ((int8_t)(127))
#endif
#ifndef INT16_MAX
/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MAX ((int16_t)(32767))
#endif
#ifndef INT32_MAX
/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MAX ((int32_t)(2147483647))
#endif
#ifndef UINT8_MAX
/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT8_MAX ((uint8_t)(255U))
#endif
#ifndef UINT16_MAX
/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT16_MAX ((uint16_t)(65535U))
#endif
#ifndef UINT32_MAX
/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT32_MAX ((uint32_t)(4294967295U))
#endif
#if defined(U_INT64_T_UNAVAILABLE)
# error int64_t is required for decimal format and rule-based number format.
#else
# ifndef INT64_C
/**
* Provides a platform independent way to specify a signed 64-bit integer constant.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
* @stable ICU 2.8
*/
# define INT64_C(c) c ## LL
# endif
# ifndef UINT64_C
/**
* Provides a platform independent way to specify an unsigned 64-bit integer constant.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
* @stable ICU 2.8
*/
# define UINT64_C(c) c ## ULL
# endif
# ifndef U_INT64_MIN
/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
# endif
# ifndef U_INT64_MAX
/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
# endif
# ifndef U_UINT64_MAX
/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
# endif
#endif
/*==========================================================================*/
/* Boolean data type */
/*==========================================================================*/
/** The ICU boolean type @stable ICU 2.0 */
typedef int8_t UBool;
#ifndef TRUE
/** The TRUE value of a UBool @stable ICU 2.0 */
# define TRUE 1
#endif
#ifndef FALSE
/** The FALSE value of a UBool @stable ICU 2.0 */
# define FALSE 0
#endif
/*==========================================================================*/
/* Unicode data types */
/*==========================================================================*/
/* wchar_t-related definitions -------------------------------------------- */
/*
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
*
* @stable ICU 2.0
*/
/*
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
*
* @stable ICU 2.0
*/
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
# ifdef __STDC_ISO_10646__
# if (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# elif (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif defined __UCS2__
# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# endif
# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
# if (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
# define U_WCHAR_IS_UTF32
# elif U_PLATFORM_HAS_WIN32_API
# define U_WCHAR_IS_UTF16
# endif
#endif
/* UChar and UChar32 definitions -------------------------------------------- */
/** Number of bytes in a UChar. @stable ICU 2.0 */
#define U_SIZEOF_UCHAR 2
/**
* \def U_CHAR16_IS_TYPEDEF
* If 1, then char16_t is a typedef and not a real type (yet)
* @internal
*/
#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11)
// for AIX, uchar.h needs to be included
# include <uchar.h>
# define U_CHAR16_IS_TYPEDEF 1
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
# define U_CHAR16_IS_TYPEDEF 1
#else
# define U_CHAR16_IS_TYPEDEF 0
#endif
/**
* \var UChar
*
* The base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
* Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
*
* UChar is configurable by defining the macro UCHAR_TYPE
* on the preprocessor or compiler command line:
* -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
* (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
* This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
*
* The default is UChar=char16_t.
*
* C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
*
* In C, char16_t is a simple typedef of uint_least16_t.
* ICU requires uint_least16_t=uint16_t for data memory mapping.
* On macOS, char16_t is not available because the uchar.h standard header is missing.
*
* @stable ICU 4.4
*/
#if 1
// #if 1 is normal. UChar defaults to char16_t in C++.
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
// The intltest Makefile #defines UCHAR_TYPE=char16_t,
// so we only #define it to uint16_t if it is undefined so far.
#elif !defined(UCHAR_TYPE)
# define UCHAR_TYPE uint16_t
#endif
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
// Inside the ICU library code, never configurable.
typedef char16_t UChar;
#elif defined(UCHAR_TYPE)
typedef UCHAR_TYPE UChar;
#elif defined(__cplusplus)
typedef char16_t UChar;
#else
typedef uint16_t UChar;
#endif
/**
* \var OldUChar
* Default ICU 58 definition of UChar.
* A base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
*
* Define OldUChar to be wchar_t if that is 16 bits wide.
* If wchar_t is not 16 bits wide, then define UChar to be uint16_t.
*
* This makes the definition of OldUChar platform-dependent
* but allows direct string type compatibility with platforms with
* 16-bit wchar_t types.
*
* This is how UChar was defined in ICU 58, for transition convenience.
* Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
* The current UChar responds to UCHAR_TYPE but OldUChar does not.
*
* @stable ICU 59
*/
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t OldUChar;
#elif defined(__CHAR16_TYPE__)
typedef __CHAR16_TYPE__ OldUChar;
#else
typedef uint16_t OldUChar;
#endif
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer (same as int32_t).
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 was platform-dependent.
*
* @see U_SENTINEL
* @stable ICU 2.4
*/
typedef int32_t UChar32;
/**
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with U_SENTINEL.
*
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* @return -1
* @see UChar32
* @stable ICU 2.4
*/
#define U_SENTINEL (-1)
#include "unicode/urename.h"
#endif

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,733 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf16.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep09
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 16-bit Unicode handling macros
*
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF16_H__
#define __UTF16_H__
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* single-code point definitions -------------------------------------------- */
/**
* Does this code unit alone encode a code point (BMP, not a surrogate)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
/**
* Is this code unit a lead surrogate (U+d800..U+dbff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
/**
* Is this code unit a surrogate (U+d800..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
/**
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
* is it a lead surrogate?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
/**
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
* is it a trail surrogate?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 4.2
*/
#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
/**
* Helper constant for U16_GET_SUPPLEMENTARY.
* @internal
*/
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
/**
* Get a supplementary code point value (U+10000..U+10ffff)
* from its lead and trail surrogates.
* The result is undefined if the input values are not
* lead and trail surrogates.
*
* @param lead lead surrogate (U+d800..U+dbff)
* @param trail trail surrogate (U+dc00..U+dfff)
* @return supplementary code point (U+10000..U+10ffff)
* @stable ICU 2.4
*/
#define U16_GET_SUPPLEMENTARY(lead, trail) \
(((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
/**
* Get the lead surrogate (0xd800..0xdbff) for a
* supplementary code point (0x10000..0x10ffff).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return lead surrogate (U+d800..U+dbff) for supplementary
* @stable ICU 2.4
*/
#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
/**
* Get the trail surrogate (0xdc00..0xdfff) for a
* supplementary code point (0x10000..0x10ffff).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return trail surrogate (U+dc00..U+dfff) for supplementary
* @stable ICU 2.4
*/
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
/**
* How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
* The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
* @param c 32-bit code point
* @return 1 or 2
* @stable ICU 2.4
*/
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
* @return 2
* @stable ICU 2.4
*/
#define U16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* The result is undefined if the offset points to a single, unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_GET
* @stable ICU 2.4
*/
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
if(U16_IS_SURROGATE_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
} else { \
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 2.4
*/
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 60
*/
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset points to a single, unpaired lead surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_NEXT
* @stable ICU 2.4
*/
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 60
*/
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s const UChar * string buffer
* @param i string offset
* @param c code point to append
* @see U16_APPEND
* @stable ICU 2.4
*/
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a surrogate pair is written, checks for sufficient space in the string.
* If the code point is not valid or a trail surrogate does not fit,
* then isError is set to TRUE.
*
* @param s const UChar * string buffer
* @param i string offset, must be i<capacity
* @param capacity size of the string buffer
* @param c code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U16_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* c>0x10ffff or not enough space */ { \
(isError)=TRUE; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_FWD_1
* @stable ICU 2.4
*/
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @see U16_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_FWD_N
* @stable ICU 2.4
*/
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U16_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
U16_FWD_1(s, i, length); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_START
* @stable ICU 2.4
*/
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i
* @see U16_SET_CP_START_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_PREV
* @stable ICU 2.4
*/
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
uint16_t __c2; \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 60
*/
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_BACK_1
* @stable ICU 2.4
*/
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @see U16_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_BACK_N
* @stable ICU 2.4
*/
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start start of string
* @param i string offset, must be start<i
* @param n number of code points to skip
* @see U16_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U16_BACK_1(s, start, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)-1])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, start<=i<=length
* @param length int32_t string length
* @see U16_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
#endif

View File

@ -0,0 +1,881 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf8.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 8-bit Unicode handling macros
*
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF8_H__
#define __UTF8_H__
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* internal definitions ----------------------------------------------------- */
/**
* Counts the trail bytes for a UTF-8 lead byte.
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
(U8_IS_LEAD(leadByte) ? \
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
/**
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
* @internal
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* Lead byte E0..EF bits 3..0 are used as byte index,
* first trail byte bits 7..5 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD3_AND_T1
* @internal
*/
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
* Internal 3-byte UTF-8 validity check.
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
/**
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* First trail byte bits 7..4 are used as byte index,
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD4_AND_T1
* @internal
*/
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
* Internal 4-byte UTF-8 validity check.
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
/**
* Function for handling "next code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
/**
* Function for handling "append code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
* Function for handling "previous code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
/**
* Function for handling "skip backward one code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* single-code point definitions -------------------------------------------- */
/**
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
/**
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
// 0x32=0xf4-0xc2
/**
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
/**
* How many code units (bytes) are used for the UTF-8 encoding
* of this Unicode code point?
* @param c 32-bit code point
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
* @stable ICU 2.4
*/
#define U8_LENGTH(c) \
((uint32_t)(c)<=0x7f ? 1 : \
((uint32_t)(c)<=0x7ff ? 2 : \
((uint32_t)(c)<=0xd7ff ? 3 : \
((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
((uint32_t)(c)<=0xffff ? 3 : 4)\
) \
) \
) \
)
/**
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
* @return 4
* @stable ICU 2.4
*/
#define U8_MAX_LENGTH 4
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
* The result is undefined if the offset points to an illegal UTF-8
* byte sequence.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_GET
* @stable ICU 2.4
*/
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_unsafe_index=(int32_t)(i); \
U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to a negative value.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_GET_UNSAFE
* @stable ICU 2.4
*/
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_index=(i); \
U8_SET_CP_START(s, start, _u8_get_index); \
U8_NEXT(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_GET() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_GET
* @stable ICU 51
*/
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_index=(i); \
U8_SET_CP_START(s, start, _u8_get_index); \
U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* The result is undefined if the offset points to a trail byte
* or an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_NEXT
* @stable ICU 2.4
*/
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
(i)+=2; \
} else { \
(c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
(i)+=3; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to a negative value.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_NEXT() if that distinction is important.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_NEXT
* @stable ICU 51
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
/** @internal */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t = 0; \
if((i)!=(length) && \
/* fetch/validate/assemble all but last trail byte */ \
((c)>=0xe0 ? \
((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
(__t&=0x3f, 1) \
: /* U+10000..U+10FFFF */ \
((c)-=0xf0)<=4 && \
U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
(__t=(s)[i]-0x80)<=0x3f) && \
/* valid second-to-last trail byte */ \
((c)=((c)<<6)|__t, ++(i)!=(length)) \
: /* U+0080..U+07FF */ \
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
/* last trail byte */ \
(__t=(s)[i]-0x80)<=0x3f && \
((c)=((c)<<6)|__t, ++(i), 1)) { \
} else { \
(c)=(sub); /* ill-formed*/ \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s const uint8_t * string buffer
* @param i string offset
* @param c code point to append
* @see U8_APPEND
* @stable ICU 2.4
*/
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
uint32_t __uc=(c); \
if(__uc<=0x7f) { \
(s)[(i)++]=(uint8_t)__uc; \
} else { \
if(__uc<=0x7ff) { \
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
} else { \
if(__uc<=0xffff) { \
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
} else { \
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a non-ASCII code point is written, checks for sufficient space in the string.
* If the code point is not valid or trail bytes do not fit,
* then isError is set to TRUE.
*
* @param s const uint8_t * string buffer
* @param i int32_t string offset, must be i<capacity
* @param capacity int32_t size of the string buffer
* @param c UChar32 code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U8_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
uint32_t __uc=(c); \
if(__uc<=0x7f) { \
(s)[(i)++]=(uint8_t)__uc; \
} else if(__uc<=0x7ff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else { \
(isError)=TRUE; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_FWD_1
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @see U8_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
uint8_t __b=(s)[(i)++]; \
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
uint8_t __t1=(s)[i]; \
if((0xe0<=__b && __b<0xf0)) { \
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} else if(__b<0xe0) { \
if(U8_IS_TRAIL(__t1)) { \
++(i); \
} \
} else /* c>=0xf0 */ { \
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_FWD_N
* @stable ICU 2.4
*/
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U8_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
U8_FWD_1(s, i, length); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_START
* @stable ICU 2.4
*/
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
while(U8_IS_TRAIL((s)[i])) { --(i); } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i
* @see U8_SET_CP_START_UNSAFE
* @see U8_TRUNCATE_IF_INCOMPLETE
* @stable ICU 2.4
*/
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U8_IS_TRAIL((s)[(i)])) { \
(i)=utf8_back1SafeBody(s, start, (i)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* If the string ends with a UTF-8 byte sequence that is valid so far
* but incomplete, then reduce the length of the string to end before
* the lead byte of that incomplete sequence.
* For example, if the string ends with E1 80, the length is reduced by 2.
*
* In all other cases (the string ends with a complete sequence, or it is not
* possible for any further trail byte to extend the trailing sequence)
* the length remains unchanged.
*
* Useful for processing text split across multiple buffers
* (save the incomplete sequence for later)
* and for optimizing iteration
* (check for string length only once per character).
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_SET_CP_START(), this macro never reads s[length].
*
* (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param length int32_t string length (usually start<=length)
* @see U8_SET_CP_START
* @stable ICU 61
*/
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
if((length)>(start)) { \
uint8_t __b1=s[(length)-1]; \
if(U8_IS_SINGLE(__b1)) { \
/* common ASCII character */ \
} else if(U8_IS_LEAD(__b1)) { \
--(length); \
} else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
uint8_t __b2=s[(length)-2]; \
if(0xe0<=__b2 && __b2<=0xf4) { \
if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
(length)-=2; \
} \
} else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
uint8_t __b3=s[(length)-3]; \
if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
(length)-=3; \
} \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_PREV
* @stable ICU 2.4
*/
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(U8_IS_TRAIL(c)) { \
uint8_t __b, __count=1, __shift=6; \
\
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
break; \
} else { \
(c)|=(UChar32)(__b&0x3f)<<__shift; \
++__count; \
__shift+=6; \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_PREV() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_PREV
* @stable ICU 51
*/
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_BACK_1
* @stable ICU 2.4
*/
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
while(U8_IS_TRAIL((s)[--(i)])) {} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @see U8_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U8_IS_TRAIL((s)[--(i)])) { \
(i)=utf8_back1SafeBody(s, start, (i)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_BACK_N
* @stable ICU 2.4
*/
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t index of the start of the string
* @param i int32_t string offset, must be start<i
* @param n number of code points to skip
* @see U8_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U8_BACK_1(s, start, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
U8_BACK_1_UNSAFE(s, i); \
U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i<=length
* @param length int32_t string length
* @see U8_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0)) { \
U8_BACK_1(s, start, i); \
U8_FWD_1(s, i, length); \
} \
} UPRV_BLOCK_MACRO_END
#endif

View File

@ -0,0 +1,24 @@
"calloc",
"free",
"iswalnum",
"iswalpha",
"iswblank",
"iswdigit",
"iswlower",
"iswspace",
"iswupper",
"iswxdigit",
"malloc",
"memchr",
"memcmp",
"memcpy",
"memmove",
"memset",
"realloc",
"strcmp",
"strlen",
"strncat",
"strncmp",
"strncpy",
"towlower",
"towupper",

View File

@ -0,0 +1,113 @@
// This file implements a very simple allocator for external scanners running
// in WASM. Allocation is just bumping a static pointer and growing the heap
// as needed, and freeing is mostly a noop. But in the special case of freeing
// the last-allocated pointer, we'll reuse that pointer again.
#ifdef TREE_SITTER_FEATURE_WASM
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
extern void tree_sitter_debug_message(const char *, size_t);
#define PAGESIZE 0x10000
#define MAX_HEAP_SIZE (4 * 1024 * 1024)
typedef struct {
size_t size;
char data[0];
} Region;
static Region *heap_end = NULL;
static Region *heap_start = NULL;
static Region *next = NULL;
// Get the region metadata for the given heap pointer.
static inline Region *region_for_ptr(void *ptr) {
return ((Region *)ptr) - 1;
}
// Get the location of the next region after the given region,
// if the given region had the given size.
static inline Region *region_after(Region *self, size_t len) {
char *address = self->data + len;
char *aligned = (char *)((uintptr_t)(address + 3) & ~0x3);
return (Region *)aligned;
}
static void *get_heap_end() {
return (void *)(__builtin_wasm_memory_size(0) * PAGESIZE);
}
static int grow_heap(size_t size) {
size_t new_page_count = ((size - 1) / PAGESIZE) + 1;
return __builtin_wasm_memory_grow(0, new_page_count) != SIZE_MAX;
}
// Clear out the heap, and move it to the given address.
void reset_heap(void *new_heap_start) {
heap_start = new_heap_start;
next = new_heap_start;
heap_end = get_heap_end();
}
void *malloc(size_t size) {
Region *region_end = region_after(next, size);
if (region_end > heap_end) {
if ((char *)region_end - (char *)heap_start > MAX_HEAP_SIZE) {
return NULL;
}
if (!grow_heap(size)) return NULL;
heap_end = get_heap_end();
}
void *result = &next->data;
next->size = size;
next = region_end;
return result;
}
void free(void *ptr) {
if (ptr == NULL) return;
Region *region = region_for_ptr(ptr);
Region *region_end = region_after(region, region->size);
// When freeing the last allocated pointer, re-use that
// pointer for the next allocation.
if (region_end == next) {
next = region;
}
}
void *calloc(size_t count, size_t size) {
void *result = malloc(count * size);
memset(result, 0, count * size);
return result;
}
void *realloc(void *ptr, size_t new_size) {
if (ptr == NULL) {
return malloc(new_size);
}
Region *region = region_for_ptr(ptr);
Region *region_end = region_after(region, region->size);
// When reallocating the last allocated region, return
// the same pointer, and skip copying the data.
if (region_end == next) {
next = region;
return malloc(new_size);
}
void *result = malloc(new_size);
memcpy(result, &region->data, region->size);
return result;
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,31 @@
#ifndef TREE_SITTER_WASM_H_
#define TREE_SITTER_WASM_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "tree_sitter/api.h"
#include "./parser.h"
bool ts_wasm_store_start(TSWasmStore *self, TSLexer *lexer, const TSLanguage *language);
void ts_wasm_store_reset(TSWasmStore *self);
bool ts_wasm_store_has_error(const TSWasmStore *self);
bool ts_wasm_store_call_lex_main(TSWasmStore *self, TSStateId state);
bool ts_wasm_store_call_lex_keyword(TSWasmStore *self, TSStateId state);
uint32_t ts_wasm_store_call_scanner_create(TSWasmStore *self);
void ts_wasm_store_call_scanner_destroy(TSWasmStore *self, uint32_t scanner_address);
bool ts_wasm_store_call_scanner_scan(TSWasmStore *self, uint32_t scanner_address, uint32_t valid_tokens_ix);
uint32_t ts_wasm_store_call_scanner_serialize(TSWasmStore *self, uint32_t scanner_address, char *buffer);
void ts_wasm_store_call_scanner_deserialize(TSWasmStore *self, uint32_t scanner, const char *buffer, unsigned length);
void ts_wasm_language_retain(const TSLanguage *self);
void ts_wasm_language_release(const TSLanguage *self);
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_WASM_H_

View File

@ -27,9 +27,9 @@ commands = {
.out = "*compilation*",
.footer_panel = true,
.save_dirty_files = true,
.win = "code\\bin\\build.bat",
.linux = "./code/bin/package.sh",
.mac = "code/bin/package-mac.sh", },
.win = "bash build_new\\scripts\\build.sh",
.linux = "build_new/scripts/build.sh",
.mac = "build_new/scripts/build.sh", },
.run = { .out = "*run*", .footer_panel = false, .save_dirty_files = false,
.win = "build\\4ed.exe",
.linux = "build/4ed",