From 3a7f4db69e515659128de5d9d592e8b64675d24e Mon Sep 17 00:00:00 2001
From: Allen Webster
Date: Sun, 18 Sep 2016 10:41:15 -0400
Subject: [PATCH] relex API allows starting from any chunk, fixed null check bug

---
 4coder_API.html   |  8 ++--
 4cpp_lexer.h      | 95 +++++++++++++++++++++++++++++++----------------
 4ed_file_view.cpp | 37 ++++++++++++++++--
 3 files changed, 101 insertions(+), 39 deletions(-)

diff --git a/4coder_API.html b/4coder_API.html
index 2489973c..52ceab08 100644
--- a/4coder_API.html
+++ b/4coder_API.html
@@ -292,7 +292,7 @@ It should point at the String in the first element of the array.
count
The count parameter specifies the number of elements in the str_set array.
str
The str parameter specifies the string to match against the str_set.
match_index
If this call succeeds, match_index is filled with the index into str_set where the match occurred.
Description
This call tries to see if str matches any of the strings in str_set. If there is a match, the call succeeds and returns non-zero. The matching rule is equivalent to the matching rule for match.

See Also
match

§4.3.116: string_set_match

fstr_bool string_set_match(
String *str_set,
int32_t count,
String str,
int32_t *match_index
)
Parameters
str_set
The str_set parameter is an array of String structs specifying matchable strings.
count
The count parameter specifies the number of String structs in the str_set array.
str
The str parameter specifies the string to match against the str_set.
match_index
If this call succeeds, match_index is filled with the index into str_set where the match occurred.
Description
This call tries to see if str matches any of the strings in str_set. If there is a match, the call succeeds and returns non-zero. The matching rule is equivalent to the matching rule for match.
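
For example, a minimal sketch of dispatching on a keyword. The make_lit_string helper, the keyword list, and the word variable are illustrative assumptions, not part of this documentation.

// keywords is the matchable set; word is a String holding the text to classify.
String keywords[3];
keywords[0] = make_lit_string("if");
keywords[1] = make_lit_string("else");
keywords[2] = make_lit_string("while");

int32_t match_index = 0;
if (string_set_match(keywords, 3, word, &match_index)){
    // word matched one of the keywords; match_index is its index into keywords.
}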

See Also
match


§5 Lexer Library

§5.1 Lexer Intro

The 4cpp lexer system provides a polished, fast, flexible system that takes in C/C++ and outputs a tokenization of the text data. There are two API levels. One level is set up to let you easily get a tokenization of the file. This level manages memory for you with malloc to make it as fast as possible to start getting your tokens. The second level enables deep integration by allowing control over allocation, data chunking, and output rate control.

To use the quick setup API you simply include 4cpp_lexer.h and read the documentation at cpp_lex_file.

To use the fancier API, include 4cpp_lexer.h and read the documentation at cpp_lex_step. If you want to be absolutely sure you are not including malloc into your program, you can define FCPP_FORBID_MALLOC before the include and the "step" API will continue to work.
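
For example (a minimal sketch; the comment's wording is illustrative):

// in the translation unit that should only use the "step" API, with no malloc pulled in
#define FCPP_FORBID_MALLOC
#include "4cpp_lexer.h"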

There are a few more features in 4cpp that are not documented yet. You are free to try to use these, but I am not totally sure they are ready yet, and when they are they will be documented.

§5.2 Lexer Function List

§5.3 Lexer Types List

§5.4 Lexer Function Descriptions

§5.4.1: cpp_get_token

Cpp_Get_Token_Result cpp_get_token(
Cpp_Token_Array *token_array_in,
int32_t pos
)
Parameters
token_array
The array of tokens from which to get a token.
pos
The position, measured in bytes, to get the token for.
Return
A Cpp_Get_Token_Result struct is returned containing the index of a token and a flag indicating whether the pos is contained in the token or in whitespace after the token.
Description
This call performs a binary search over all of the tokens looking for the token that contains the specified position. If the position
@@ -339,11 +339,11 @@ to free the temp buffer that was originally used to make the lex state. This cal
a new temp buffer when you are ready to resume lexing.

However the new buffer needs to have the same contents the old buffer had. To ensure this, you have to use cpp_lex_data_temp_size and cpp_lex_data_temp_read to get the relevant contents of the temp buffer before you free it.
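
A sketch of that save/restore sequence. The exact signatures of cpp_lex_data_temp_size and cpp_lex_data_temp_read are not reproduced in this section, so the shapes below (a size query and a copy into a caller-supplied buffer), along with the my_alloc/my_free placeholders, are assumptions.

// Assumed shapes, for illustration only:
//   int32_t cpp_lex_data_temp_size(Cpp_Lex_Data *lex_data);
//   void    cpp_lex_data_temp_read(Cpp_Lex_Data *lex_data, char *out_buffer);
int32_t temp_size = cpp_lex_data_temp_size(&lex_data);
char *new_temp = (char*)my_alloc(temp_size);
cpp_lex_data_temp_read(&lex_data, new_temp);
my_free(old_temp);
// hand new_temp back to the lex state before resuming lexing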


See Also
cpp_lex_data_temp_size
cpp_lex_data_temp_read

§5.4.7: cpp_relex_init

Cpp_Relex_Data cpp_relex_init(
Cpp_Token_Array *array,
int32_t start_pos,
int32_t end_pos,
int32_t character_shift_amount,
char *spare
)
No documentation generated for this function.

§5.4.8: cpp_relex_start_position

int32_t cpp_relex_start_position(
Cpp_Relex_Data *S_ptr
)
No documentation generated for this function.

§5.4.9: cpp_relex_declare_first_chunk_position

void cpp_relex_declare_first_chunk_position(
Cpp_Relex_Data *S_ptr,
int32_t position
)
No documentation generated for this function.

§5.4.10: cpp_relex_is_start_chunk

int32_t cpp_relex_is_start_chunk(
Cpp_Relex_Data *S_ptr,
char *chunk,
int32_t chunk_size
)
No documentation generated for this function.
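
No usage is documented for this call yet. Judging from its implementation later in this patch, it reports whether a given chunk contains the relex start position, advancing the internal chunk position when it does not. The sketch below is a hypothetical use built on that reading; data, size, state, and the 1024-byte chunking are placeholders, and the end-of-buffer (null chunk) case is not shown.

// Hypothetical: walk chunks from the front of the buffer and let
// cpp_relex_is_start_chunk skip the ones before the relex start position.
cpp_relex_declare_first_chunk_position(&state, 0);

char *chunk = data;
int32_t chunk_size = 1024;
for (int32_t pos = 0; pos < size; pos += 1024, chunk += 1024){
    if (pos + chunk_size > size){
        chunk_size = size - pos;
    }
    if (cpp_relex_is_start_chunk(&state, chunk, chunk_size)){
        // chunk is the first chunk cpp_relex_step needs to see.
        break;
    }
}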

§5.4.11: cpp_relex_step

Cpp_Lex_Result cpp_relex_step(
Cpp_Relex_Data *S_ptr,
char *chunk,
int32_t chunk_size,
int32_t full_size,
Cpp_Token_Array *array,
Cpp_Token_Array *relex_array
)
No documentation generated for this function.

§5.4.12: cpp_relex_get_new_count

int32_t cpp_relex_get_new_count(
Cpp_Relex_Data *S_ptr,
int32_t current_count,
Cpp_Token_Array *relex_array
)
No documentation generated for this function.

§5.4.13: cpp_relex_complete

void cpp_relex_complete(
Cpp_Relex_Data *S_ptr,
Cpp_Token_Array *array,
Cpp_Token_Array *relex_array
)
No documentation generated for this function.

§5.4.14: cpp_relex_abort

void cpp_relex_abort(
Cpp_Relex_Data *S_ptr,
Cpp_Token_Array *array
)
No documentation generated for this function.
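
The relex entry points above have no generated documentation yet, but the 4ed_file_view.cpp changes in this same patch show how they are meant to be driven: jump directly to the chunk containing the relex start position, then feed chunks to cpp_relex_step until it finishes. The sketch below condenses that usage into one routine; the routine name, the 1024-byte chunk size, the abort-on-NeedTokenMemory policy, and the resize-before-complete step are illustrative choices, not requirements of the API.

static int32_t
relex_file_in_chunks(char *data, int32_t size, char *spare,
                     int32_t start_i, int32_t end_i, int32_t shift_amount,
                     Cpp_Token_Array *array, Cpp_Token_Array *relex_array){
    Cpp_Relex_Data state = cpp_relex_init(array, start_i, end_i, shift_amount, spare);
    
    char *chunk = data;
    int32_t chunk_size = 1024;
    int32_t chunk_index = 0;
    
    // Start at the chunk containing the relex start position, or feed a
    // null chunk if the edit begins at the very end of the buffer.
    int32_t start_position = cpp_relex_start_position(&state);
    if (start_position == size){
        chunk = 0;
        chunk_size = 0;
        cpp_relex_declare_first_chunk_position(&state, size);
    }
    else{
        chunk_index = start_position / chunk_size;
        
        int32_t chunk_start = chunk_index*1024;
        if (chunk_start + chunk_size > size){
            chunk_size = size - chunk_start;
        }
        
        cpp_relex_declare_first_chunk_position(&state, chunk_start);
        chunk += chunk_start;
    }
    
    for (;;){
        Cpp_Lex_Result result = cpp_relex_step(&state, chunk, chunk_size, size, array, relex_array);
        
        switch (result){
            case LexResult_NeedChunk:
            {
                // Step to the next chunk, clamping the final chunk to the end of the buffer.
                ++chunk_index;
                chunk += chunk_size;
                
                int32_t chunk_start = chunk_index*1024;
                if (chunk_start + chunk_size > size){
                    chunk_size = size - chunk_start;
                }
            }break;
            
            case LexResult_NeedTokenMemory:
            {
                // relex_array is out of space; give up on the incremental path.
                cpp_relex_abort(&state, array);
                return(0);
            }
            
            case LexResult_Finished:
            {
                // Make room in array for the merged result, then merge relex_array into it.
                int32_t new_count = cpp_relex_get_new_count(&state, array->count, relex_array);
                if (new_count > array->max_count){
                    cpp_resize_token_array(array, new_count);
                }
                cpp_relex_complete(&state, array, relex_array);
                return(1);
            }
        }
    }
}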

§5.4.15: cpp_make_token_array

Cpp_Token_Array cpp_make_token_array(
int32_t starting_max
)
Parameters
starting_max
The number of tokens to initialize the array with.
Return
An empty Cpp_Token_Array with memory malloc'd for storing tokens.
Description
This call allocates a Cpp_Token_Array with malloc for use in other convenience functions. Stacks that are not allocated this way should not be -used in the convenience functions.




§5.4.16: cpp_free_token_array

void cpp_free_token_array(
Cpp_Token_Array token_array
)
Parameters
token_array
An array previously allocated by cpp_make_token_array
Description
This call frees a Cpp_Token_Array.

See Also

§5.4.17: cpp_resize_token_array

void cpp_resize_token_array(
Cpp_Token_Array *token_array,
int32_t new_max
)
Parameters
token_array
An array previously allocated by cpp_make_token_array.
new_max
The new maximum size the array should support. If this is not greater than the current size of the array the operation is ignored.
Description
This call allocates a new memory chunk and moves the existing tokens in the array over to the new chunk.
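
For example, the full lifecycle of a malloc-backed token array (the counts are arbitrary):

// allocate space for 100 tokens up front
Cpp_Token_Array array = cpp_make_token_array(100);

// ... later, when more room is needed, grow the array:
cpp_resize_token_array(&array, 400);

// release the malloc'd memory when the tokens are no longer needed
cpp_free_token_array(array);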


See Also

§5.4.18: cpp_lex_file

void cpp_lex_file(
char *data,
int32_t size,
Cpp_Token_Array *token_array_out
)
Parameters
data
The file data to be lexed in a single contiguous block.
size
The number of bytes in data.
token_array_out
The token array where the output tokens will be pushed. This token array must be previously allocated with cpp_make_token_array
Description
Lexes an entire file and manages the interaction with the lexer system so that it is quick and convenient to lex files.



Cpp_Token_Array lex_file(char *file_name){
    File_Data file = read_whole_file(file_name);
    
    // This array will be automatically grown if it runs
    // out of memory.
    Cpp_Token_Array array = cpp_make_token_array(100);
    
    cpp_lex_file(file.data, file.size, &array);
    
    return(array);
}
See Also

§5.5 Lexer Type Descriptions

§5.5.1: Cpp_Token_Type

enum Cpp_Token_Type;
Description
A Cpp_Token_Type classifies a token to make parsing easier. Some types are not

diff --git a/4cpp_lexer.h b/4cpp_lexer.h
index 3e102fca..89f9e51f 100644
--- a/4cpp_lexer.h
+++ b/4cpp_lexer.h
@@ -800,7 +800,7 @@ cpp_lex_nonalloc_null_end_no_limit(Cpp_Lex_Data *S_ptr, char *chunk, int32_t siz
             break;
         }
         
-        if (chunk[S.pos-1] == 0){
+        if (S.pos > S.chunk_pos && chunk[S.pos-1] == 0){
             --S.pos;
         }
         
@@ -1207,6 +1207,40 @@ cpp_relex_init(Cpp_Token_Array *array, int32_t start_pos, int32_t end_pos, int32
     
     return(state);
 }
 
+FCPP_LINK int32_t
+cpp_relex_start_position(Cpp_Relex_Data *S_ptr){
+    int32_t result = S_ptr->relex_start_position;
+    return(result);
+}
+
+FCPP_LINK void
+cpp_relex_declare_first_chunk_position(Cpp_Relex_Data *S_ptr, int32_t position){
+    S_ptr->lex.chunk_pos = position;
+}
+
+FCPP_LINK int32_t
+cpp_relex_is_start_chunk(Cpp_Relex_Data *S_ptr, char *chunk, int32_t chunk_size){
+    int32_t pos = S_ptr->relex_start_position;
+    int32_t start = S_ptr->lex.chunk_pos;
+    int32_t end = start + chunk_size;
+    
+    int32_t good_chunk = 0;
+    if (start <= pos && pos < end){
+        good_chunk = 1;
+    }
+    else{
+        if (chunk == 0){
+            good_chunk = 1;
+            S_ptr->lex.chunk_pos = pos;
+        }
+        else{
+            S_ptr->lex.chunk_pos += chunk_size;
+        }
+    }
+    
+    return(good_chunk);
+}
+
 // duff-routine defines
 #define DrCase(PC) case PC: goto resumespot_##PC
 
@@ -1232,41 +1266,40 @@ cpp_relex_step(Cpp_Relex_Data *S_ptr, char *chunk, int32_t chunk_size, int32_t f
     cpp_shift_token_starts(array, S.end_token_index, S.character_shift_amount);
     S.end_token = cpp_index_array(array, full_size, S.end_token_index);
     
-    if (S.relex_start_position < full_size){
-        // TODO(allen): This can be better I suspect.
-        for (;;){
-            Cpp_Lex_Result step_result =
-                cpp_lex_nonalloc_no_null_out_limit(&S.lex, chunk, chunk_size, full_size,
-                                                   relex_array, 1);
-            
-            switch (step_result){
-                case LexResult_HitTokenLimit:
-                {
-                    Cpp_Token token = relex_array->tokens[relex_array->count-1];
-                    if (token.type == S.end_token.type &&
-                        token.start == S.end_token.start &&
-                        token.size == S.end_token.size &&
-                        token.flags == S.end_token.flags &&
-                        token.state_flags == S.end_token.state_flags){
-                        --relex_array->count;
-                        goto double_break;
-                    }
-                    
-                    while (S.lex.pos > S.end_token.start && S.end_token_index < array->count){
-                        ++S.end_token_index;
-                        S.end_token = cpp_index_array(array, full_size, S.end_token_index);
-                    }
-                }
-                break;
-                
-                case LexResult_NeedChunk: DrYield(1, LexResult_NeedChunk); break;
-                
-                case LexResult_NeedTokenMemory: DrYield(2, LexResult_NeedTokenMemory); break;
-                
-                case LexResult_Finished: goto double_break;
-            }
-        }
-    }
+    // TODO(allen): This can be better I suspect.
+    for (;;){
+        Cpp_Lex_Result step_result =
+            cpp_lex_nonalloc_no_null_out_limit(&S.lex, chunk, chunk_size, full_size,
+                                               relex_array, 1);
+        
+        switch (step_result){
+            case LexResult_HitTokenLimit:
+            {
+                Cpp_Token token = relex_array->tokens[relex_array->count-1];
+                if (token.type == S.end_token.type &&
+                    token.start == S.end_token.start &&
+                    token.size == S.end_token.size &&
+                    token.flags == S.end_token.flags &&
+                    token.state_flags == S.end_token.state_flags){
+                    --relex_array->count;
+                    goto double_break;
+                }
+                
+                while (S.lex.pos > S.end_token.start && S.end_token_index < array->count){
+                    ++S.end_token_index;
+                    S.end_token = cpp_index_array(array, full_size, S.end_token_index);
+                }
+            }
+            break;
+            
+            case LexResult_NeedChunk: DrYield(1, LexResult_NeedChunk); break;
+            
+            case LexResult_NeedTokenMemory: DrYield(2, LexResult_NeedTokenMemory); break;
+            
+            case LexResult_Finished: goto double_break;
+        }
+    }
+    
     double_break:;
     DrReturn(LexResult_Finished);

diff --git a/4ed_file_view.cpp b/4ed_file_view.cpp
index 10b49062..d86323ff 100644
--- a/4ed_file_view.cpp
+++ b/4ed_file_view.cpp
@@ -1276,20 +1276,49 @@ file_relex_parallel(System_Functions *system,
     relex_array.tokens = push_array(part, Cpp_Token, relex_array.max_count);
     
     i32 size = file->state.buffer.size;
-    char *spare = push_array(part, char, size);
+    char *spare = push_array(part, char, size+1);
     
     Cpp_Relex_Data state = cpp_relex_init(array, start_i, end_i, shift_amount, spare);
     
     char *chunk = file->state.buffer.data;
-    i32 chunk_size = size;
+    i32 chunk_size = 1024;
+    i32 chunk_index = 0;
+    
+    int32_t start_position = cpp_relex_start_position(&state);
+    
+    if (start_position == size){
+        chunk = 0;
+        chunk_size = 0;
+        cpp_relex_declare_first_chunk_position(&state, size);
+    }
+    else{
+        chunk_index = start_position / chunk_size;
+        
+        int32_t chunk_start_position = chunk_index*1024;
+        if (chunk_start_position + chunk_size > size){
+            chunk_size = size - chunk_start_position;
+        }
+        
+        cpp_relex_declare_first_chunk_position(&state, chunk_start_position);
+        
+        chunk += chunk_start_position;
+    }
+    
     for(;;){
         Cpp_Lex_Result lex_result =
             cpp_relex_step(&state, chunk, chunk_size, size, array, &relex_array);
         
         switch (lex_result){
             case LexResult_NeedChunk:
-            Assert(!"There is only one chunk in the current system.");
-            break;
+            {
+                ++chunk_index;
+                chunk += chunk_size;
+                
+                int32_t chunk_start_position = chunk_index*1024;
+                if (chunk_start_position + chunk_size > size){
+                    chunk_size = size - chunk_start_position;
+                }
+            }break;
             
             case LexResult_NeedTokenMemory:
             inline_lex = 0;