// TOP
// Include guard: everything up to the matching #endif near the bottom of this
// file is skipped when FCPP_LEXER_TYPES_INC has already been defined.
#ifndef FCPP_LEXER_TYPES_INC
#define FCPP_LEXER_TYPES_INC
|
|
|
|
// ENUM(type, name) introduces an enumeration with an explicitly chosen
// storage type: `name` becomes a typedef of the integer type `type`, and the
// brace-enclosed enumerator list written after the macro invocation is
// declared as a separate enum tagged `name_`.
// The #ifndef keeps any ENUM definition that was supplied before this point.
#ifndef ENUM
#define ENUM(type,name) typedef type name; enum name##_
#endif
|
|
|
|
// INTERNAL_ENUM(type, name) has the same expansion as ENUM: `name` becomes a
// typedef of the integer type `type`, and the enumerator list that follows the
// invocation is declared as an enum tagged `name_`.
// The #ifndef keeps any INTERNAL_ENUM definition supplied before this point.
#ifndef INTERNAL_ENUM
#define INTERNAL_ENUM(type,name) typedef type name; enum name##_
#endif
|
|
|
|
/* DOC(A Cpp_Token_Type classifies a token to make parsing easier. Some types are not
|
|
actually output by the lexer, but exist because parsers will also make use of token
|
|
types in their own output.) */
|
|
ENUM(uint32_t, Cpp_Token_Type){
|
|
CPP_TOKEN_JUNK,
|
|
CPP_TOKEN_COMMENT,
|
|
|
|
CPP_PP_INCLUDE,
|
|
CPP_PP_DEFINE,
|
|
CPP_PP_UNDEF,
|
|
CPP_PP_IF,
|
|
CPP_PP_IFDEF,
|
|
CPP_PP_IFNDEF,
|
|
CPP_PP_ELSE,
|
|
CPP_PP_ELIF,
|
|
CPP_PP_ENDIF,
|
|
CPP_PP_ERROR,
|
|
CPP_PP_IMPORT,
|
|
CPP_PP_USING,
|
|
CPP_PP_LINE,
|
|
CPP_PP_PRAGMA,
|
|
CPP_PP_STRINGIFY,
|
|
CPP_PP_CONCAT,
|
|
CPP_PP_UNKNOWN,
|
|
|
|
CPP_TOKEN_KEY_TYPE,
|
|
CPP_TOKEN_KEY_MODIFIER,
|
|
CPP_TOKEN_KEY_QUALIFIER,
|
|
/* DOC(This type is not stored in token output from the lexer.) */
|
|
CPP_TOKEN_KEY_OPERATOR,
|
|
CPP_TOKEN_KEY_CONTROL_FLOW,
|
|
CPP_TOKEN_KEY_CAST,
|
|
CPP_TOKEN_KEY_TYPE_DECLARATION,
|
|
CPP_TOKEN_KEY_ACCESS,
|
|
CPP_TOKEN_KEY_LINKAGE,
|
|
CPP_TOKEN_KEY_OTHER,
|
|
|
|
CPP_TOKEN_IDENTIFIER,
|
|
CPP_TOKEN_INTEGER_CONSTANT,
|
|
CPP_TOKEN_CHARACTER_CONSTANT,
|
|
CPP_TOKEN_FLOATING_CONSTANT,
|
|
CPP_TOKEN_STRING_CONSTANT,
|
|
CPP_TOKEN_BOOLEAN_CONSTANT,
|
|
|
|
CPP_TOKEN_STATIC_ASSERT,
|
|
|
|
CPP_TOKEN_BRACKET_OPEN,
|
|
CPP_TOKEN_BRACKET_CLOSE,
|
|
CPP_TOKEN_PARENTHESE_OPEN,
|
|
CPP_TOKEN_PARENTHESE_CLOSE,
|
|
CPP_TOKEN_BRACE_OPEN,
|
|
CPP_TOKEN_BRACE_CLOSE,
|
|
CPP_TOKEN_SEMICOLON,
|
|
CPP_TOKEN_ELLIPSIS,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_STAR,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_AMPERSAND,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_TILDE,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_PLUS,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_MINUS,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_INCREMENT,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_DECREMENT,
|
|
|
|
// NOTE(allen): Precedence 1, LtoR
|
|
CPP_TOKEN_SCOPE,
|
|
|
|
// NOTE(allen): Precedence 2, LtoR
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSTINC,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSTDEC,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_FUNC_STYLE_CAST,
|
|
CPP_TOKEN_CPP_STYLE_CAST,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_CALL,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_INDEX,
|
|
CPP_TOKEN_DOT,
|
|
CPP_TOKEN_ARROW,
|
|
|
|
// NOTE(allen): Precedence 3, RtoL
|
|
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_PREINC,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_PREDEC,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSITIVE,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_NEGAITVE,
|
|
CPP_TOKEN_NOT,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_BIT_NOT,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_CAST,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_DEREF,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_TYPE_PTR,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_ADDRESS,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_TYPE_REF,
|
|
CPP_TOKEN_SIZEOF,
|
|
CPP_TOKEN_ALIGNOF,
|
|
CPP_TOKEN_DECLTYPE,
|
|
CPP_TOKEN_TYPEID,
|
|
CPP_TOKEN_NEW,
|
|
CPP_TOKEN_DELETE,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_NEW_ARRAY,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_DELETE_ARRAY,
|
|
|
|
// NOTE(allen): Precedence 4, LtoR
|
|
CPP_TOKEN_PTRDOT,
|
|
CPP_TOKEN_PTRARROW,
|
|
|
|
// NOTE(allen): Precedence 5, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_MUL,
|
|
CPP_TOKEN_DIV,
|
|
CPP_TOKEN_MOD,
|
|
|
|
// NOTE(allen): Precedence 6, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_ADD,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_SUB,
|
|
|
|
// NOTE(allen): Precedence 7, LtoR
|
|
CPP_TOKEN_LSHIFT,
|
|
CPP_TOKEN_RSHIFT,
|
|
|
|
// NOTE(allen): Precedence 8, LtoR
|
|
CPP_TOKEN_LESS,
|
|
CPP_TOKEN_GRTR,
|
|
CPP_TOKEN_GRTREQ,
|
|
CPP_TOKEN_LESSEQ,
|
|
|
|
// NOTE(allen): Precedence 9, LtoR
|
|
CPP_TOKEN_EQEQ,
|
|
CPP_TOKEN_NOTEQ,
|
|
|
|
// NOTE(allen): Precedence 10, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_BIT_AND,
|
|
|
|
// NOTE(allen): Precedence 11, LtoR
|
|
CPP_TOKEN_BIT_XOR,
|
|
|
|
// NOTE(allen): Precedence 12, LtoR
|
|
CPP_TOKEN_BIT_OR,
|
|
|
|
// NOTE(allen): Precedence 13, LtoR
|
|
CPP_TOKEN_AND,
|
|
|
|
// NOTE(allen): Precedence 14, LtoR
|
|
CPP_TOKEN_OR,
|
|
|
|
// NOTE(allen): Precedence 15, RtoL
|
|
CPP_TOKEN_TERNARY_QMARK,
|
|
CPP_TOKEN_COLON,
|
|
CPP_TOKEN_THROW,
|
|
CPP_TOKEN_EQ,
|
|
CPP_TOKEN_ADDEQ,
|
|
CPP_TOKEN_SUBEQ,
|
|
CPP_TOKEN_MULEQ,
|
|
CPP_TOKEN_DIVEQ,
|
|
CPP_TOKEN_MODEQ,
|
|
CPP_TOKEN_LSHIFTEQ,
|
|
CPP_TOKEN_RSHIFTEQ,
|
|
CPP_TOKEN_ANDEQ,
|
|
CPP_TOKEN_OREQ,
|
|
CPP_TOKEN_XOREQ,
|
|
|
|
// NOTE(allen): Precedence 16, LtoR
|
|
CPP_TOKEN_COMMA,
|
|
|
|
CPP_TOKEN_DEFINED,
|
|
CPP_TOKEN_INCLUDE_FILE,
|
|
CPP_TOKEN_ERROR_MESSAGE,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_EOF,
|
|
|
|
CPP_TOKEN_TYPE_COUNT
|
|
};
|
|
|
|
struct Cpp_Token{
|
|
Cpp_Token_Type type;
|
|
int32_t start, size;
|
|
uint16_t state_flags;
|
|
uint16_t flags;
|
|
};
|
|
|
|
// Token flag bits. Every value is a distinct power of two, so any set of them
// can be combined with bitwise OR into one uint16_t without overlapping.
// NOTE(review): presumably these are the bits stored in Cpp_Token::flags -- confirm.
ENUM(uint16_t, Cpp_Token_Flag){
    CPP_TFLAG_IGNORE        = 0x1,
    CPP_TFLAG_PP_DIRECTIVE  = 0x2,
    CPP_TFLAG_PP_BODY       = 0x4,
    CPP_TFLAG_BAD_ENDING    = 0x8,
    CPP_TFLAG_MULTILINE     = 0x10,
    CPP_TFLAG_PARAMETERIZED = 0x20,
    CPP_TFLAG_IS_OPERATOR   = 0x40,
    CPP_TFLAG_IS_KEYWORD    = 0x80
};
|
|
|
|
// Preprocessor states, numbered by position starting at 0.
// CPP_LEX_PP_COUNT is the number of states declared before it.
// NOTE(review): the internal Cpp_Lex_PP_State enum in this file lists similar
// states in a different order (include before identifier, body_if before
// body), so values of the two enums are not interchangeable.
ENUM(uint16_t, Cpp_Preprocessor_State){
    CPP_LEX_PP_DEFAULT,
    CPP_LEX_PP_IDENTIFIER,
    CPP_LEX_PP_MACRO_IDENTIFIER,
    CPP_LEX_PP_INCLUDE,
    CPP_LEX_PP_BODY,
    CPP_LEX_PP_BODY_IF,
    CPP_LEX_PP_NUMBER,
    CPP_LEX_PP_ERROR,
    CPP_LEX_PP_JUNK,
    // Not a state: the number of states above.
    CPP_LEX_PP_COUNT
};
|
|
|
|
struct Cpp_Token_Array{
|
|
Cpp_Token *tokens;
|
|
int32_t count, max_count;
|
|
};
|
|
// All-zero Cpp_Token_Array value (null `tokens`, zero counts). Being `static`,
// every translation unit that includes this file gets its own copy.
static Cpp_Token_Array null_cpp_token_array = {0};
|
|
|
|
// Pair of values describing the outcome of a token lookup.
struct Cpp_Get_Token_Result{
    // NOTE(review): presumably an index into a Cpp_Token_Array -- confirm.
    int32_t token_index;
    // NOTE(review): presumably used as a boolean (non-zero when the queried
    // position is in whitespace rather than inside a token) -- confirm.
    int32_t in_whitespace;
};
|
|
|
|
struct Cpp_Relex_State{
|
|
char *data;
|
|
int32_t size;
|
|
|
|
Cpp_Token_Array *array;
|
|
int32_t start, end, amount;
|
|
int32_t start_token_i;
|
|
int32_t end_token_i;
|
|
int32_t relex_start;
|
|
int32_t tolerance;
|
|
int32_t space_request;
|
|
};
|
|
|
|
// Compact lexer state-machine record made of four one-byte fields.
struct Cpp_Lex_FSM{
    // NOTE(review): presumably holds a Cpp_Lex_State value (both are uint8_t) -- confirm.
    uint8_t state;
    // NOTE(review): presumably holds a Cpp_Lex_Int_State value (both are uint8_t) -- confirm.
    uint8_t int_state;
    // NOTE(review): presumably boolean flags -- confirm.
    uint8_t emit_token;
    uint8_t multi_line;
};
|
|
// All-zero Cpp_Lex_FSM value. Being `static`, every translation unit that
// includes this file gets its own copy.
static Cpp_Lex_FSM null_lex_fsm = {0};
|
|
|
|
struct Cpp_Lex_Data{
|
|
char *tb;
|
|
int32_t tb_pos;
|
|
int32_t token_start;
|
|
|
|
int32_t pos;
|
|
int32_t pos_overide;
|
|
int32_t chunk_pos;
|
|
|
|
Cpp_Lex_FSM fsm;
|
|
uint8_t white_done;
|
|
uint8_t pp_state;
|
|
uint8_t completed;
|
|
|
|
Cpp_Token token;
|
|
|
|
int32_t __pc__;
|
|
};
|
|
|
|
// Lexer result codes, numbered by position starting at 0
// (LexResult_Finished == 0 ... LexResult_HitTokenLimit == 3).
ENUM(int32_t, Cpp_Lex_Result){
    LexResult_Finished,
    LexResult_NeedChunk,
    LexResult_NeedTokenMemory,
    LexResult_HitTokenLimit
};
|
|
|
|
INTERNAL_ENUM(uint8_t, Cpp_Lex_State){
|
|
LS_default,
|
|
LS_identifier,
|
|
LS_pound,
|
|
LS_pp,
|
|
LS_ppdef,
|
|
LS_char,
|
|
LS_char_multiline,
|
|
LS_char_slashed,
|
|
LS_string,
|
|
LS_string_multiline,
|
|
LS_string_slashed,
|
|
LS_number,
|
|
LS_number0,
|
|
LS_float,
|
|
LS_crazy_float0,
|
|
LS_crazy_float1,
|
|
LS_hex,
|
|
LS_comment_pre,
|
|
LS_comment,
|
|
LS_comment_slashed,
|
|
LS_comment_block,
|
|
LS_comment_block_ending,
|
|
LS_dot,
|
|
LS_ellipsis,
|
|
LS_less,
|
|
LS_less_less,
|
|
LS_more,
|
|
LS_more_more,
|
|
LS_minus,
|
|
LS_arrow,
|
|
LS_and,
|
|
LS_or,
|
|
LS_plus,
|
|
LS_colon,
|
|
LS_star,
|
|
LS_modulo,
|
|
LS_caret,
|
|
LS_eq,
|
|
LS_bang,
|
|
LS_error_message,
|
|
//
|
|
LS_count
|
|
};
|
|
|
|
// Internal integer-related lexer states, numbered by position starting at 0.
// NOTE(review): the u/l/L/ul/uL/ll names presumably track integer-literal
// suffix characters seen so far -- confirm against the lexer.
// LSINT_count is the number of states declared before it.
INTERNAL_ENUM(uint8_t, Cpp_Lex_Int_State){
    LSINT_default,
    LSINT_u,
    LSINT_l,
    LSINT_L,
    LSINT_ul,
    LSINT_uL,
    LSINT_ll,
    LSINT_extra,
    // Not a state: the number of states above.
    LSINT_count
};
|
|
|
|
// Internal preprocessor lexer states, numbered by position starting at 0.
// LSPP_count is the number of states declared before it.
// NOTE(review): the public Cpp_Preprocessor_State enum in this file lists
// similar states in a different order (identifier before include, body before
// body_if), so values of the two enums are not interchangeable.
INTERNAL_ENUM(uint8_t, Cpp_Lex_PP_State){
    LSPP_default,
    LSPP_include,
    LSPP_macro_identifier,
    LSPP_identifier,
    LSPP_body_if,
    LSPP_body,
    LSPP_number,
    LSPP_error,
    LSPP_junk,
    // Not a state: the number of states above.
    LSPP_count
};
|
|
|
|
#endif
|
|
|
|
// BOTTOM
|
|
|