4coder/4cpp_lexer_types.h


// TOP

#ifndef FCPP_LEXER_TYPES_INC
#define FCPP_LEXER_TYPES_INC

// ENUM(type,name): declares `name` as a typedef of the integer type `type`, then
// opens an enum tagged `name##_`; the brace-enclosed enumerator list is written
// directly after the macro use. Variables of type `name` therefore have the
// size of `type`, while the enumerators belong to the separately tagged enum.
// The #ifndef guard lets an earlier definition of ENUM take precedence.
#ifndef ENUM
#define ENUM(type,name) typedef type name; enum name##_
#endif

// INTERNAL_ENUM(type,name): expands exactly like ENUM (typedef `type` as `name`,
// then open `enum name##_`).
// NOTE(review): the distinct macro name presumably lets tooling tell internal
// enums apart from public ones -- confirm.
// The #ifndef guard lets an earlier definition of INTERNAL_ENUM take precedence.
#ifndef INTERNAL_ENUM
#define INTERNAL_ENUM(type,name) typedef type name; enum name##_
#endif

/* DOC(A Cpp_Token_Type classifies a token to make parsing easier. Some types are not
actually output by the lexer, but exist because parsers will also make use of token
types in their own output.) */
ENUM(uint32_t, Cpp_Token_Type){
	CPP_TOKEN_JUNK,
	CPP_TOKEN_COMMENT,

	CPP_PP_INCLUDE,
	CPP_PP_DEFINE,
	CPP_PP_UNDEF,
	CPP_PP_IF,
	CPP_PP_IFDEF,
	CPP_PP_IFNDEF,
	CPP_PP_ELSE,
	CPP_PP_ELIF,
	CPP_PP_ENDIF,
	CPP_PP_ERROR,
	CPP_PP_IMPORT,
	CPP_PP_USING,
	CPP_PP_LINE,
	CPP_PP_PRAGMA,
	CPP_PP_STRINGIFY,
	CPP_PP_CONCAT,
	CPP_PP_UNKNOWN,

	CPP_TOKEN_KEY_TYPE,
	CPP_TOKEN_KEY_MODIFIER,
	CPP_TOKEN_KEY_QUALIFIER,
    /* DOC(This type is not stored in token output from the lexer.) */
	CPP_TOKEN_KEY_OPERATOR,
	CPP_TOKEN_KEY_CONTROL_FLOW,
    CPP_TOKEN_KEY_CAST,
	CPP_TOKEN_KEY_TYPE_DECLARATION,
	CPP_TOKEN_KEY_ACCESS,
	CPP_TOKEN_KEY_LINKAGE,
	CPP_TOKEN_KEY_OTHER,

	CPP_TOKEN_IDENTIFIER,
	CPP_TOKEN_INTEGER_CONSTANT,
	CPP_TOKEN_CHARACTER_CONSTANT,
	CPP_TOKEN_FLOATING_CONSTANT,
	CPP_TOKEN_STRING_CONSTANT,
	CPP_TOKEN_BOOLEAN_CONSTANT,

    CPP_TOKEN_STATIC_ASSERT,

	CPP_TOKEN_BRACKET_OPEN,
	CPP_TOKEN_BRACKET_CLOSE,
	CPP_TOKEN_PARENTHESE_OPEN,
	CPP_TOKEN_PARENTHESE_CLOSE,
	CPP_TOKEN_BRACE_OPEN,
	CPP_TOKEN_BRACE_CLOSE,
    CPP_TOKEN_SEMICOLON,
    CPP_TOKEN_ELLIPSIS,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_STAR,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_AMPERSAND,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_TILDE,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_PLUS,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_MINUS,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_INCREMENT,

    /* DOC(This is an 'ambiguous' token type because it requires
    parsing to determine the full nature of the token.) */
	CPP_TOKEN_DECREMENT,

    // NOTE(allen): Precedence 1, LtoR
	CPP_TOKEN_SCOPE,

    // NOTE(allen): Precedence 2, LtoR
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_POSTINC,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_POSTDEC,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_FUNC_STYLE_CAST,
    CPP_TOKEN_CPP_STYLE_CAST,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_CALL,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_INDEX,
	CPP_TOKEN_DOT,
	CPP_TOKEN_ARROW,

    // NOTE(allen): Precedence 3, RtoL

    /* DOC(This token is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_PREINC,
    /* DOC(This token is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_PREDEC,
    /* DOC(This token is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_POSITIVE,
    /* DOC(This token is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_NEGAITVE,
	CPP_TOKEN_NOT,

    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_BIT_NOT,

    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_CAST,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_DEREF,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_TYPE_PTR,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_ADDRESS,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_TYPE_REF,
    CPP_TOKEN_SIZEOF,
    CPP_TOKEN_ALIGNOF,
    CPP_TOKEN_DECLTYPE,
    CPP_TOKEN_TYPEID,
    CPP_TOKEN_NEW,
    CPP_TOKEN_DELETE,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_NEW_ARRAY,
    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_DELETE_ARRAY,

    // NOTE(allen): Precedence 4, LtoR
	CPP_TOKEN_PTRDOT,
	CPP_TOKEN_PTRARROW,

    // NOTE(allen): Precedence 5, LtoR

    /* DOC(This type is for parser use, it is not output by the lexer.) */
	CPP_TOKEN_MUL,
	CPP_TOKEN_DIV,
	CPP_TOKEN_MOD,

    // NOTE(allen): Precedence 6, LtoR

    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_ADD,

    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_SUB,

    // NOTE(allen): Precedence 7, LtoR
	CPP_TOKEN_LSHIFT,
	CPP_TOKEN_RSHIFT,

    // NOTE(allen): Precedence 8, LtoR
	CPP_TOKEN_LESS,
	CPP_TOKEN_GRTR,
	CPP_TOKEN_GRTREQ,
	CPP_TOKEN_LESSEQ,

    // NOTE(allen): Precedence 9, LtoR
    CPP_TOKEN_EQEQ,
    CPP_TOKEN_NOTEQ,

    // NOTE(allen): Precedence 10, LtoR

    /* DOC(This type is for parser use, it is not output by the lexer.) */
	CPP_TOKEN_BIT_AND,

    // NOTE(allen): Precedence 11, LtoR
	CPP_TOKEN_BIT_XOR,

    // NOTE(allen): Precedence 12, LtoR
	CPP_TOKEN_BIT_OR,

    // NOTE(allen): Precedence 13, LtoR
	CPP_TOKEN_AND,

    // NOTE(allen): Precedence 14, LtoR
	CPP_TOKEN_OR,

    // NOTE(allen): Precedence 15, RtoL
    CPP_TOKEN_TERNARY_QMARK,
	CPP_TOKEN_COLON,
    CPP_TOKEN_THROW,
	CPP_TOKEN_EQ,
	CPP_TOKEN_ADDEQ,
	CPP_TOKEN_SUBEQ,
	CPP_TOKEN_MULEQ,
	CPP_TOKEN_DIVEQ,
	CPP_TOKEN_MODEQ,
	CPP_TOKEN_LSHIFTEQ,
	CPP_TOKEN_RSHIFTEQ,
	CPP_TOKEN_ANDEQ,
	CPP_TOKEN_OREQ,
	CPP_TOKEN_XOREQ,

    // NOTE(allen): Precedence 16, LtoR
	CPP_TOKEN_COMMA,

    CPP_TOKEN_DEFINED,
	CPP_TOKEN_INCLUDE_FILE,
    CPP_TOKEN_ERROR_MESSAGE,

    /* DOC(This type is for parser use, it is not output by the lexer.) */
    CPP_TOKEN_EOF,

    CPP_TOKEN_TYPE_COUNT
};

// A single token record: a Cpp_Token_Type classification, two 32-bit position
// fields and two 16-bit flag words.
struct Cpp_Token{
    // Classification of this token.
    Cpp_Token_Type type;

    // NOTE(review): presumably the offset of the token's first byte in the lexed text -- confirm.
    int32_t start;
    // NOTE(review): presumably the token's length in bytes -- confirm.
    int32_t size;

    // NOTE(review): presumably holds the preprocessor state active at this token -- confirm
    // which of the two preprocessor-state enums in this file it is compared against.
    uint16_t state_flags;
    // NOTE(review): presumably a mask of Cpp_Token_Flag bits (both are uint16_t) -- confirm.
    uint16_t flags;
};

// Bit flags: every enumerator is a distinct single bit, so values can be OR-ed
// together in a uint16_t.
// NOTE(review): presumably these are the bits stored in Cpp_Token::flags -- confirm.
ENUM(uint16_t, Cpp_Token_Flag){
    CPP_TFLAG_IGNORE        = 0x1,
    CPP_TFLAG_PP_DIRECTIVE  = 0x2,
    CPP_TFLAG_PP_BODY       = 0x4,
    CPP_TFLAG_BAD_ENDING    = 0x8,
    CPP_TFLAG_MULTILINE     = 0x10,
    CPP_TFLAG_PARAMETERIZED = 0x20,
    CPP_TFLAG_IS_OPERATOR   = 0x40,
    CPP_TFLAG_IS_KEYWORD    = 0x80
};

// Preprocessor lexing states, numbered implicitly from 0 in declaration order.
// NOTE(review): Cpp_Lex_PP_State in this file lists similarly named states in a
// different order (identifier/include and body/body_if are swapped), so equally
// named entries of the two enums do not share numeric values -- confirm this is intended.
ENUM(uint16_t, Cpp_Preprocessor_State){
    CPP_LEX_PP_DEFAULT,
    CPP_LEX_PP_IDENTIFIER,
    CPP_LEX_PP_MACRO_IDENTIFIER,
    CPP_LEX_PP_INCLUDE,
    CPP_LEX_PP_BODY,
    CPP_LEX_PP_BODY_IF,
    CPP_LEX_PP_NUMBER,
    CPP_LEX_PP_ERROR,
    CPP_LEX_PP_JUNK,

    // Last entry: equals the number of states declared above it.
    CPP_LEX_PP_COUNT
};

// A pointer to Cpp_Token storage together with two 32-bit counters.
struct Cpp_Token_Array{
    // Token storage.
    Cpp_Token *tokens;

    // NOTE(review): presumably the number of tokens currently stored -- confirm.
    int32_t count;
    // NOTE(review): presumably the capacity of `tokens`, in tokens -- confirm.
    int32_t max_count;
};
// Zero-initialized Cpp_Token_Array: `{0}` leaves the tokens pointer null and both counters 0.
// Declared `static` in a header, so every translation unit that includes it gets its own copy.
static Cpp_Token_Array null_cpp_token_array = {0};

// Pair of 32-bit values describing the outcome of a token lookup.
struct Cpp_Get_Token_Result{
    // NOTE(review): presumably an index into a Cpp_Token_Array -- confirm.
    int32_t token_index;

    // NOTE(review): presumably a boolean (non-zero when the queried position lies
    // in whitespace rather than inside a token) -- confirm.
    int32_t in_whitespace;
};

// State record carried through a relex operation over an existing token array.
// NOTE(review): the field meanings are not visible in this header; the grouping
// below follows the names only -- confirm against the relex routines.
struct Cpp_Relex_State{
    // Character buffer and its 32-bit size.
    char *data;
    int32_t size;

    // Token array this state refers to.
    Cpp_Token_Array *array;

    // NOTE(review): presumably the edited range and the size change it caused -- confirm.
    int32_t start;
    int32_t end;
    int32_t amount;

    // NOTE(review): presumably token indices bounding the region being relexed -- confirm.
    int32_t start_token_i;
    int32_t end_token_i;

    int32_t relex_start;
    int32_t tolerance;
    int32_t space_request;
};

// Four one-byte fields describing the lexer's state machine.
struct Cpp_Lex_FSM{
    // NOTE(review): presumably a Cpp_Lex_State value (that enum is also uint8_t) -- confirm.
    uint8_t state;

    // NOTE(review): presumably a Cpp_Lex_Int_State value (also uint8_t) -- confirm.
    uint8_t int_state;

    // NOTE(review): presumably boolean flags -- confirm.
    uint8_t emit_token;
    uint8_t multi_line;
};
// Zero-initialized Cpp_Lex_FSM: `{0}` sets all four byte fields to 0.
// Declared `static` in a header, so every translation unit that includes it gets its own copy.
static Cpp_Lex_FSM null_lex_fsm = {0};

// Lexer working state: a character buffer, position counters, the state machine,
// a few byte-sized status fields and the token currently being built.
// NOTE(review): field meanings are not visible in this header; the comments below
// are inferred from names and types only -- confirm against the lexer implementation.
struct Cpp_Lex_Data{
    // NOTE(review): presumably a scratch buffer for token text, plus a write position -- confirm.
    char *tb;
    int32_t tb_pos;
    int32_t token_start;

    int32_t pos;
    // The field name is spelled 'overide' (sic); kept because code outside this
    // file may reference it.
    int32_t pos_overide;
    int32_t chunk_pos;

    Cpp_Lex_FSM fsm;
    uint8_t white_done;
    // NOTE(review): presumably a Cpp_Lex_PP_State value (that enum is also uint8_t) -- confirm.
    uint8_t pp_state;
    uint8_t completed;

    Cpp_Token token;

    // NOTE(review): identifiers containing a double underscore are reserved for the
    // implementation in C and C++; the name is kept because code outside this file
    // may reference it.
    int32_t __pc__;
};

// Result codes, numbered implicitly from 0 in declaration order.
// NOTE(review): presumably returned by the lexer's step routine to tell the caller
// why lexing stopped -- confirm.
ENUM(int32_t, Cpp_Lex_Result){
    LexResult_Finished,
    LexResult_NeedChunk,
    LexResult_NeedTokenMemory,
    LexResult_HitTokenLimit
};

// Main lexer state machine states, numbered implicitly from 0 in declaration order.
// NOTE(review): the blank-line grouping below follows enumerator names only; the
// order itself is unchanged and must stay so, since it fixes each state's value.
INTERNAL_ENUM(uint8_t, Cpp_Lex_State){
    LS_default,
    LS_identifier,

    LS_pound,
    LS_pp,
    LS_ppdef,

    LS_char,
    LS_char_multiline,
    LS_char_slashed,

    LS_string,
    LS_string_multiline,
    LS_string_slashed,

    LS_number,
    LS_number0,
    LS_float,
    LS_crazy_float0,
    LS_crazy_float1,
    LS_hex,

    LS_comment_pre,
    LS_comment,
    LS_comment_slashed,
    LS_comment_block,
    LS_comment_block_ending,

    LS_dot,
    LS_ellipsis,
    LS_less,
    LS_less_less,
    LS_more,
    LS_more_more,
    LS_minus,
    LS_arrow,
    LS_and,
    LS_or,
    LS_plus,
    LS_colon,
    LS_star,
    LS_modulo,
    LS_caret,
    LS_eq,
    LS_bang,

    LS_error_message,

    // Last entry: equals the number of states declared above it.
    LS_count
};

// Secondary state machine states, numbered implicitly from 0 in declaration order.
// NOTE(review): the names suggest this tracks integer-literal suffixes
// (u, l, L, ul, uL, ll) -- confirm against the lexer implementation.
INTERNAL_ENUM(uint8_t, Cpp_Lex_Int_State){
    LSINT_default,

    LSINT_u,
    LSINT_l,
    LSINT_L,
    LSINT_ul,
    LSINT_uL,
    LSINT_ll,

    LSINT_extra,

    // Last entry: equals the number of states declared above it.
    LSINT_count
};

// Preprocessor states for the lexer, numbered implicitly from 0 in declaration order.
// NOTE(review): Cpp_Preprocessor_State in this file declares similarly named states
// in a different order (identifier/include and body/body_if are swapped), so equally
// named entries of the two enums do not share numeric values -- confirm this is intended.
INTERNAL_ENUM(uint8_t, Cpp_Lex_PP_State){
    LSPP_default,

    LSPP_include,
    LSPP_macro_identifier,
    LSPP_identifier,

    LSPP_body_if,
    LSPP_body,

    LSPP_number,
    LSPP_error,
    LSPP_junk,

    // Last entry: equals the number of states declared above it.
    LSPP_count
};

#endif

// BOTTOM