/* 4coder_lex_gen_main.cpp - A generator for language lexers. */ // TOP #if !defined(LANG_NAME_LOWER) || !defined(LANG_NAME_CAMEL) #error 4coder_lex_get_main.cpp not correctly included. #endif #include "4coder_base_types.h" #include "4coder_table.h" #include "4coder_token.h" #include "4coder_base_types.cpp" #include "4coder_stringf.cpp" #include "4coder_malloc_allocator.cpp" #include "4coder_table.cpp" //////////////////////////////// struct Keyword{ Keyword *next; String_Const_u8 name; String_Const_u8 lexeme; Token_Base_Kind base_kind; }; struct Keyword_Set{ Keyword *first; Keyword *last; i32 count; }; struct Character_Name_Set{ Table_u64_Data char_to_name; }; struct Operator{ Operator *next; String_Const_u8 op; String_Const_u8 name; Token_Base_Kind base_kind; }; struct Operator_Set{ Operator *first; Operator *last; i32 count; }; struct Direct_Kind{ Direct_Kind *next; String_Const_u8 name; Token_Base_Kind base_kind; }; //////////////////////////////// int main(void){ // utf-8 bytes u8 utf8[129]; for (u16 i = 0; i < 128; i += 1){ utf8[i] = i + 128; } utf8[128] = 0; // Base Names char_name('{', "BraceOp"); char_name('}', "BraceCl"); char_name('(', "ParenOp"); char_name(')', "ParenCl"); char_name('[', "BrackOp"); char_name(']', "BrackCl"); char_name('-', "Minus"); char_name('+', "Plus"); char_name('.', "Dot"); char_name('!', "Bang"); char_name('*', "Star"); char_name(',', "Comma"); char_name(':', "Colon"); char_name(';', "Semicolon"); char_name('@', "At"); char_name('#', "Pound"); char_name('$', "Dollar"); char_name('%', "Percent"); char_name('^', "Carrot"); char_name('&', "And"); char_name('=', "Eq"); char_name('<', "Less"); char_name('>', "Grtr"); char_name('~', "Tilde"); char_name('/', "Slash"); char_name('?', "Question"); char_name('|', "Pipe"); // CPP Names char_name('!', "Not"); char_name('|', "Or"); char_name('%', "Mod"); char_name('^', "Xor"); char_name('?', "Ternary"); // Typical Base Token Kinds select_base_kind(TokenBaseKind_EOF); direct_token_kind("EOF"); select_base_kind(TokenBaseKind_Whitespace); direct_token_kind("Whitespace"); select_base_kind(TokenBaseKind_LexError); direct_token_kind("LexError"); // CPP Direct Toke Kinds select_base_kind(TokenBaseKind_Comment); direct_token_kind("BlockComment"); direct_token_kind("LineComment"); select_base_kind(TokenBaseKind_Whitespace); direct_token_kind("Backslash"); select_base_kind(TokenBaseKind_LiteralInteger); direct_token_kind("LiteralInteger"): direct_token_kind("LiteralIntegerU"): direct_token_kind("LiteralIntegerL"): direct_token_kind("LiteralIntegerUL"): direct_token_kind("LiteralIntegerLL"): direct_token_kind("LiteralIntegerULL"): direct_token_kind("LiteralIntegerHex"): direct_token_kind("LiteralIntegerHexU"): direct_token_kind("LiteralIntegerHexL"): direct_token_kind("LiteralIntegerHexUL"): direct_token_kind("LiteralIntegerHexLL"): direct_token_kind("LiteralIntegerHexULL"): direct_token_kind("LiteralIntegerOct"): direct_token_kind("LiteralIntegerOctU"): direct_token_kind("LiteralIntegerOctL"): direct_token_kind("LiteralIntegerOctUL"): direct_token_kind("LiteralIntegerOctLL"): direct_token_kind("LiteralIntegerOctULL"): select_base_kind(TokenBaseKind_LiteralFloat); direct_token_kind("LiteralFloat32"): direct_token_kind("LiteralFloat64"): select_base_kind(TokenBaseKind_LiteralString); direct_token_kind("LiteralString"): direct_token_kind("LiteralStringWide"): direct_token_kind("LiteralStringUTF8"): direct_token_kind("LiteralStringUTF16"): direct_token_kind("LiteralStringUTF32"): direct_token_kind("LiteralStringRaw"): direct_token_kind("LiteralStringWideRaw"): direct_token_kind("LiteralStringUTF8Raw"): direct_token_kind("LiteralStringUTF16Raw"): direct_token_kind("LiteralStringUTF32Raw"): direct_token_kind("LiteralCharacter"): direct_token_kind("LiteralCharacterWide"): direct_token_kind("LiteralCharacterUTF8"): direct_token_kind("LiteralCharacterUTF16"): direct_token_kind("LiteralCharacterUTF32"): direct_token_kind("PPIncludeFile"); // CPP Operators Operator_Set *main_ops = begin_op_set(); select_base_kind(TokenBaseKind_ScopeOpen); op("{"); select_base_kind(TokenBaseKind_ScopeClose); op("}"); select_base_kind(TokenBaseKind_ParentheticalOpen); op("("); op("["); select_base_kind(TokenBaseKind_ParentheticalClose); op(")"); op("]"); select_base_kind(TokenBaseKind_Operator); op(":"); op("..."); op("::"); op("++"); op("--"); op("."); op("->", "Arrow"); op("+"); op("-"); op("!"); op("~"); op("*"); op("&"); op(".*"); op("->*", "ArrowStar"); op("/"); op("%"); char_name('<', "Left"); char_name('>', "Right"); op("<<"); op(">>"); op("<=>", "Compare"); char_name('<', "Less"); char_name('>', "Grtr"); op("<"); op("<="); op(">"); op(">="); op("=="); op("!="); op("^"); op("|"); op("&&"); op("||"); op("?"); op("="); op("+="); op("-="); op("*="); op("/="); op("%="); char_name('<', "Left"); char_name('>', "Right"); op("<<="); op(">>="); op(","); // CPP Preprocess Operators Operator_Set *pp_ops = begin_op_set(); op("#", "PPStringify"); op("##", "PPConcat"); // CPP Keywords Keyword_Set *main_keys = begin_key_set(); select_base_kind(TokenBaseKind_Keyword); key("Void"); key("Bool"); key("Char"); key("Int"); key("Float"); key("Double"); key("Long"); key("Short"); key("Unsigned"); key("Signed"); key("Const"); key("Volatile"); key("Asm"); key("Break"); key("Case"); key("Catch"); key("Continue"); key("Default"); key("Do"); key("Else"); key("For"); key("Goto"); key("If"); key("Return"); key("Switch"); key("Try"); key("While"); key("StaticAssert", "static_assert"); key("ConstCast", "const_cast"); key("DynamicCast", "dynamic_cast"); key("ReinterpretCast", "reinterpret_cast"); key("StaticCast", "static_cast"); key("Class"); key("Enum"); key("Struct"); key("Typedef"); key("Union"); key("Template"); key("Typename"); key("Friend"); key("Namespace"); key("Private"); key("Protected"); key("Public"); key("Using"); key("Extern"); key("Export"); key("Inline"); key("Static"); key("Virtual"); key("AlignAs"); key("Explicit"); key("NoExcept"); key("NullPtr"); key("Operator"); key("Register"); key("This"); key("ThreadLocal", "thread_local"); key("SizeOf"); key("AlignOf"); key("DeclType"); key("TypeID"); key("New"); key("Delete"); key_unmatchable("KeywordGeneric"); select_base_kind(TokenBaseKind_LiteralInteger); key("LiteralTrue", "true"); key("LiteralFalse", "false"); select_base_kind(TokenBaseKind_Identifier); key_fallback("Identifier"); // CPP Preprocess Directives Keyword_Set *pp_directive_set = begin_key_set(); select_base_kind(TokenBaseKind_Preprocessor); key("PPInclude", "#include"); key("PPVersion", "#version"); key("PPDefine", "#define"); key("PPUndef", "#undef"); key("PPIf", "#if"); key("PPIfDef", "#ifdef"); key("PPIfNDef", "#ifndef"); key("PPElse", "#else"); key("PPElIf", "#elif"); key("PPEndIf", "#endif"); key("PPError", "#error"); key("PPImport", "#import"); key("PPUsing", "#using"); key("PPLine", "#line"); key("PPPragma", "#pragma"); key_fallback("PPUnknown"); // CPP Preprocess Keywords Keyword_Set *pp_keys = begin_key_set(); select_base_kind(TokenBaseKind_Keyword); key("PPDefined", "defined"); // State Machine State *root = begin_state_machine(); Flag *is_hex = add_flag(AutoZero); Flag *is_oct = add_flag(AutoZero); Flag *is_pp_body = add_flag(KeepState); Flag *is_include_body = add_flag(KeepState); Flag *is_wide = add_flag(AutoZero); Flag *is_utf8 = add_flag(AutoZero); Flag *is_utf16 = add_flag(AutoZero); Flag *is_utf32 = add_flag(AutoZero); Flag *is_char = add_flag(AutoZero); flag_bind(is_pp_body, TokenBaseFlag_PreprocessorBody); State *identifier = add_state(); State *whitespace = add_state(); State *whitespace_end_pp = add_state(); State *backslash = add_state(); State *operator_or_fnumber_dot = add_state(); State *operator_or_comment_slash = add_state(); State *number = add_state(); State *znumber = add_state(); State *fnumber_decimal = add_state(); State *fnumber_exponent = add_state(); State *fnumber_exponent_sign = add_state(); State *fnumber_exponent_digits = add_state(); State *number_hex = add_state(); State *number_oct = add_state(); State *U_number = add_state(); State *L_number = add_state(); State *UL_number = add_state(); State *LU_number = add_state(); State *l_number = add_state(); State *Ul_number = add_state(); State *lU_number = add_state(); State *LL_number = add_state(); State *ULL_number = add_state(); State *pp_directive = add_state(); State *include_pointy = add_state(); State *include_quotes = add_state(); State *pre_L = add_state(); State *pre_u = add_state(); State *pre_U = add_state(); State *pre_u8 = add_state(); State *pre_R = add_state(); State *character = add_state(); State *string = add_state(); State *string_esc = add_state(); State *string_esc_oc2 = add_state(); State *string_esc_oc1 = add_state(); State *string_esc_hex = add_state(); State *string_esc_universal_8 = add_state(); State *string_esc_universal_7 = add_state(); State *string_esc_universal_6 = add_state(); State *string_esc_universal_5 = add_state(); State *string_esc_universal_4 = add_state(); State *string_esc_universal_3 = add_state(); State *string_esc_universal_2 = add_state(); State *string_esc_universal_1 = add_state(); State *raw_string = add_state(); State *raw_string_get_delim = add_state(); State *raw_string_finish_delim = add_state(); State *raw_string_find_close = add_state(); State *raw_string_try_delim = add_state(); State *raw_string_try_quote = add_state(); State *comment_block = add_state(); State *comment_block_try_close = add_state(); State *comment_block_newline = add_state(); State *comment_line = add_state(); Operator_Set *main_ops_without_dot_or_slash = copy_op_set(main_ops); remove_ops_with_prefix(main_ops_without_dot, "."); remove_ops_with_prefix(main_ops_without_dot, "/"); Operator_Set *main_ops_with_dot = copy_op_set(main_ops); remove_ops_without_prefix(main_ops_with_dot, "."); ops_string_skip(main_ops_with_dot, 1); //// select_state(root); sm_case("abcdefghijklmnopqrstvwxyz" "ABCDEFGHIJKMNOPQSTVWXYZ" "_$", identifier); sm_case(utf8, identifier); sm_case("L", pre_L); sm_case("u", pre_u); sm_case("U", pre_U); sm_case("R", pre_R); sm_case(" \r\t\f\v", whitespace); sm_case("\n", whitespace_end_pp); sm_case("\\", backslash); sm_case(".", operator_or_fnumber_dot); sm_case("/", operator_or_comment_slash); { Character_Set *char_set = new_char_set(); char_set_union_ops(char_set, main_ops_without_dot_or_slash); char_set_remove(char_set, ".", emit); } { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LexError"); sm_fallback(emit); } //// select_state(include_quotes); sm_case("abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "_. /\\" "0123456789", include_pointy); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("PPIncludeFile"); sm_case("\"", emit); } { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LexError"); sm_fallback(emit); } //// select_state(pre_L); sm_set_flag(is_wide, true); sm_case("\"", string); sm_case("R", pre_R); sm_fallback_peek(identifier); //// select_state(pre_u); sm_set_flag(is_utf16, true); sm_case("\"", string); sm_case("8", pre_u8); sm_case("R", pre_R); sm_fallback_peek(identifier); //// select_state(pre_U); sm_set_flag(is_utf32, true); sm_case("\"", string); sm_case("R", pre_R); sm_fallback_peek(identifier); //// select_state(pre_u8); sm_set_flag(is_utf8, true); sm_case("\"", string); sm_case("R", pre_R); sm_fallback_peek(identifier); //// select_state(pre_R); sm_case("\"", raw_string); sm_fallback_peek(identifier); //// select_state(character); sm_set_flag(is_char, true); sm_fallback_peek(string); //// select_state(string); { Emit_Rule *emit = add_emit_rule(); handler_token_kind(is_wide, "LiteralStringWide"); handler_token_kind(is_utf8 , "LiteralStringUTF8"); handler_token_kind(is_utf16, "LiteralStringUTF16"); handler_token_kind(is_utf32, "LiteralStringUTF32"); handler_token_kind("LiteralString"); sm_case_flagged(is_char, false, "\"", emit); } { Emit_Rule *emit = add_emit_rule(); handler_token_kind(is_wide, "LiteralCharacterWide"); handler_token_kind(is_utf8 , "LiteralCharacterUTF8"); handler_token_kind(is_utf16, "LiteralCharacterUTF16"); handler_token_kind(is_utf32, "LiteralCharacterUTF32"); handler_token_kind("LiteralCharacter"); sm_case_flagged(is_char, true, "\'", emit); } sm_case("\\", string_esc); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LexError"); sm_case_peek("\n", emit); } sm_case_flagged(is_char, true, "\"", string); sm_case_flagged(is_char, false, "\'", string); sm_fallback(string); //// select_state(string_esc); sm_case("'\"?\\abfnrtv", string); sm_case("01234567", string_esc_oct2); sm_case("x", string_esc_hex); sm_case("u", string_esc_universal_4); sm_case("U", string_esc_universal_8); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LexError"); sm_case_peek("\n", emit); } sm_fallback(string); //// select_state(string_esc_oct2); sm_case("01234567", string_esc_oct1); sm_fallback_peek(string); //// select_state(string_esc_oct1); sm_case("01234567", string); sm_fallback_peek(string); //// select_state(string_esc_hex); sm_case("0123456789abcdefABCDEF", string_esc_hex); sm_fallback_peek(string); //// select_state(string_esc_universal_8); sm_case("0123456789abcdefABCDEF", string_esc_universal_7); sm_fallback_peek(string); //// select_state(string_esc_universal_7); sm_case("0123456789abcdefABCDEF", string_esc_universal_6); sm_fallback_peek(string); //// select_state(string_esc_universal_6); sm_case("0123456789abcdefABCDEF", string_esc_universal_5); sm_fallback_peek(string); //// select_state(string_esc_universal_5); sm_case("0123456789abcdefABCDEF", string_esc_universal_4); sm_fallback_peek(string); //// select_state(string_esc_universal_4); sm_case("0123456789abcdefABCDEF", string_esc_universal_3); sm_fallback_peek(string); //// select_state(string_esc_universal_3); sm_case("0123456789abcdefABCDEF", string_esc_universal_2); sm_fallback_peek(string); //// select_state(string_esc_universal_2); sm_case("0123456789abcdefABCDEF", string_esc_universal_1); sm_fallback_peek(string); //// select_state(string_esc_universal_1); sm_case("0123456789abcdefABCDEF", string); sm_fallback_peek(string); //// select_state(raw_string); sm_delim_mark_first(); sm_fallback_peek(raw_string_get_delim); //// select_state(raw_string_get_delim); sm_case_peek("(", raw_string_finish_delim); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LexError"); sm_case(" \\)", emit); } sm_fallback(raw_string_get_delim); //// select_state(raw_string_finish_delim); sm_delim_mark_one_past_last(); sm_fallback(raw_string_find_close); //// select_state(raw_string_find_close); sm_case(")", raw_string_try_delim); sm_fallback(raw_string_find_close); //// select_state(raw_string_try_delim); sm_match_delim(raw_string_try_quote); sm_fallback_peek(raw_string_find_close); //// select_state(raw_string_try_quote); { Emit_Rule *emit = add_emit_rule(); handler_token_kind(is_wide, "LiteralStringWideRaw"); handler_token_kind(is_utf8 , "LiteralStringUTF8Raw"); handler_token_kind(is_utf16, "LiteralStringUTF16Raw"); handler_token_kind(is_utf32, "LiteralStringUTF32Raw"); handler_token_kind("LiteralStringRaw"); sm_case("\"", emit); } sm_fallback_peek(raw_string_find_close); //// select_state(comment_block); sm_case("*", comment_block_try_close); sm_case("\n", comment_block_newline); sm_fallback(comment_block); //// select_state(comment_block_try_close); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("BlockComment"); sm_case("/", emit); } sm_case("*", comment_block_try_close); sm_fallback(comment_block); //// select_state(comment_block_newline); sm_set_flag(is_pp_body, false); sm_set_flag(is_include_body, false); sm_fallback_peek(comment_block); //// select_state(comment_line); { Emit_Rule *emit = add_emit_rule(); handler_token_kind("LineComment"); sm_case_peek("\n", emit); } sm_fallback(comment_line); } // BOTTOM