From ff4c5f0e9de538687d9ea6f3f7c9289183652894 Mon Sep 17 00:00:00 2001
From: Thomas Walker Lynch
Date: Sat, 10 May 2025 04:41:39 -0700
Subject: [PATCH] working #macro directive, albeit still with embedded test
 messages

---
 .../how_it_works/cpp.org"                     |   0
 .../how_it_works/cpp_reader.org"              |   0
 .../how_it_works/lexing.org"                  | 230 ++++++++++++++++++
 .../tool_chain_dependency_layers.org"         |   0
 .../get_paren_balanced_chars.cc"              |  48 ++++
 "script_gcc_min-12\360\237\226\211/macro.cc"  | 134 +++++++---
 6 files changed, 384 insertions(+), 28 deletions(-)
 rename "document\360\237\226\211/source/cpp.org" => "document\360\237\226\211/how_it_works/cpp.org" (100%)
 rename "document\360\237\226\211/source/cpp_reader.org" => "document\360\237\226\211/how_it_works/cpp_reader.org" (100%)
 create mode 100644 "document\360\237\226\211/how_it_works/lexing.org"
 rename "document\360\237\226\211/tool_chain_dependency_layers.org" => "document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org" (100%)
 create mode 100644 "script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"

diff --git "a/document\360\237\226\211/source/cpp.org" "b/document\360\237\226\211/how_it_works/cpp.org"
similarity index 100%
rename from "document\360\237\226\211/source/cpp.org"
rename to "document\360\237\226\211/how_it_works/cpp.org"
diff --git "a/document\360\237\226\211/source/cpp_reader.org" "b/document\360\237\226\211/how_it_works/cpp_reader.org"
similarity index 100%
rename from "document\360\237\226\211/source/cpp_reader.org"
rename to "document\360\237\226\211/how_it_works/cpp_reader.org"
diff --git "a/document\360\237\226\211/how_it_works/lexing.org" "b/document\360\237\226\211/how_it_works/lexing.org"
new file mode 100644
index 0000000..a6bed25
--- /dev/null
+++ "b/document\360\237\226\211/how_it_works/lexing.org"
@@ -0,0 +1,230 @@
+#+TITLE: GCC libcpp Lexer: Structure, Usage, and Extension
+#+AUTHOR: Caelus (OpenAI) and Thomas Walker Lynch
+#+DATE: 2025-05-09
+
+* Overview
+The C preprocessor lexer (`lex.cc`) in GCC's `libcpp` is responsible for scanning raw source characters and emitting `cpp_token` structures. It is Unicode-aware, macro-sensitive, context-tracking, and supports multiple levels of token buffering. The lexer is both a general-purpose lexical analyzer and a specialized component of the preprocessor.
+
+This document provides:
+1. An architectural overview of how the lexer operates.
+2. Guidance on interfacing with it: how to initialize it, invoke it, and consume its output.
+3. Examples demonstrating token flow and useful idioms.
+
+* 1. About the Lexer
+
+** 1.1 Services Provided
+The lexer transforms a stream of characters into a stream of `cpp_token`s. It performs:
+- UCN (Universal Character Name) expansion.
+- Unicode normalization for identifiers.
+- Detection of digraphs and trigraphs.
+- Skipping of whitespace and comments.
+- Classification into token types (`cpp_ttype`).
+- Optional macro expansion (via higher-level coordination with the macro subsystem).
+
+The function `_cpp_lex_token()` is the main internal entry point for lexing one token from the input stream.
+
+** 1.2 Token Types and Structures
+Tokens are represented as `struct cpp_token`, which contains:
+- `type`: token kind (from `cpp_ttype`)
+- `val`: a union holding the value (e.g. number, string, identifier)
+- `flags`: indicators such as `PREV_WHITE` or `DIGRAPH`
+- `src_loc`: location for diagnostics
+- for identifiers, `val.node.spelling` additionally caches the as-written (pre-normalization) spelling
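+
+Since `val` is a union, which member is meaningful depends on `type`. A minimal sketch of reading token values (field names follow the `cpp_token` layout in `libcpp/include/cpplib.h`; the helper function itself is ours, for illustration):
+
+#+begin_src c
+// illustrative helper (not part of libcpp): dispatch on token->type
+void print_token_value (const cpp_token *token)
+{
+  switch (token->type)
+    {
+    case CPP_NAME:
+      // identifiers carry their interned cpp_hashnode
+      printf ("identifier: %.*s\n",
+              (int) token->val.node.node->ident.len,
+              (const char *) token->val.node.node->ident.str);
+      break;
+    case CPP_NUMBER:
+    case CPP_STRING:
+      // numbers and string literals carry a cpp_string (text + length)
+      printf ("literal: %.*s\n",
+              (int) token->val.str.len,
+              (const char *) token->val.str.text);
+      break;
+    default:
+      printf ("token type: %s\n", cpp_type2name (token->type, token->flags));
+    }
+}
+#+end_src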
+
+Auxiliary structures include:
+- `cpp_hashnode`: interned identifiers and macro names
+- `normalize_state`: for handling normalization and BiDi context
+- `_cpp_buff`: dynamic buffers used for temporary token storage
+
+** 1.3 Unicode and Normalization
+The lexer enforces bidirectional (BiDi) Unicode structure using:
+- `context`, `normalize_state`: track BiDi embeddings and UCN states
+- `on_char`, `on_close`, `maybe_warn_bidi_on_close`: enforce structure
+
+** 1.4 Vectorized Fast Path
+Several functions (e.g. `search_line_sse2`) accelerate scanning on x86 via SIMD. These are conditionally invoked from `search_line_fast` when alignment and CPU features allow.
+
+** 1.5 Token Buffers and Pools
+Token buffers are managed using `_cpp_get_buff` (which takes a minimum size), `_cpp_extend_buff`, `_cpp_commit_buff`, and `_cpp_release_buff`. These form a scratch/reuse pool and reduce allocations when processing macros or lexing many tokens rapidly.
+
+* 2. How to Use the Lexer API
+
+** 2.1 Initialization
+Before lexing, a reader must be created and given its main file. `cpp_create_reader` takes the language dialect, an optional identifier hash table, and the `line_maps` instance used for location tracking; it performs the internal lexer setup (token runs, buffers) itself, so no separate lexer-initialization call is needed:
+
+#+begin_src c
+// CLK_GNUC89 selects the C dialect; line_table is the program's
+// line_maps instance used for location tracking
+cpp_reader *pfile = cpp_create_reader (CLK_GNUC89, NULL, line_table);
+cpp_read_main_file (pfile, "input.c");
+#+end_src
+
+** 2.2 Lexing Tokens
+To retrieve the next token without macro expansion (internal interface):
+
+#+begin_src c
+const cpp_token *token = _cpp_lex_token (pfile);
+#+end_src
+
+For directive parsing, to lex straight from the buffer with no lookahead handling:
+
+#+begin_src c
+cpp_token *token = _cpp_lex_direct (pfile);
+#+end_src
+
+The public entry point `cpp_get_token` sits above both and performs macro expansion.
+
+** 2.3 Token Inspection
+Each token has type and value fields:
+
+#+begin_src c
+if (token->type == CPP_NUMBER) {
+  printf ("Numeric token: %s\n",
+          (const char *) cpp_token_as_text (pfile, token));
+}
+#+end_src
+
+** 2.4 Identifier Handling
+Look up an identifier in the hash table (e.g., to check whether it names a macro):
+
+#+begin_src c
+cpp_hashnode *node = cpp_lookup (pfile, (const unsigned char *) "FOO", 3);
+if (cpp_macro_p (node)) {
+  // FOO is defined as a macro
+}
+#+end_src
+
+** 2.5 Stringification and Output
+To spell a token or write it to a stream:
+
+#+begin_src c
+unsigned char *text = cpp_token_as_text (pfile, token);
+cpp_output_token (token, stdout);
+#+end_src
+
+* 3. Examples and Advanced Use
+
+** 3.1 Simple Token Stream
+Lex a stream from input and print token types:
+
+#+begin_src c
+while (true) {
+  const cpp_token *tok = _cpp_lex_token (pfile);
+  if (tok->type == CPP_EOF)
+    break;
+  printf ("Token: %s\n", cpp_type2name (tok->type, tok->flags));
+}
+#+end_src
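+
+The loop above uses the internal entry point and therefore sees tokens before macro expansion. The public interface `cpp_get_token` performs expansion and may interleave `CPP_PADDING` tokens, which consumers normally skip. A sketch of an expansion-aware loop, assuming `pfile` is initialized as in section 2.1:
+
+#+begin_src c
+while (true) {
+  const cpp_token *tok = cpp_get_token (pfile);
+  if (tok->type == CPP_EOF)
+    break;
+  if (tok->type == CPP_PADDING)  // spacing placeholder from expansion
+    continue;
+  printf ("%s ", (const char *) cpp_token_as_text (pfile, tok));
+}
+#+end_src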
+
+** 3.2 Peeking and Lookahead
+Use `cpp_peek_token` to look ahead without consuming; the second argument is the lookahead index (0 is the next token):
+
+#+begin_src c
+const cpp_token *next = cpp_peek_token (pfile, 0);
+if (next->type == CPP_OPEN_PAREN)
+  printf ("Function call?\n");
+#+end_src
+
+** 3.3 Handling Unicode Identifiers
+Identifiers containing UCNs are converted to UTF-8 and interned. For a `CPP_NAME` token, `val.node.node` is the normalized identifier, while `val.node.spelling` preserves the as-written form:
+
+#+begin_src c
+if (token->type == CPP_NAME) {
+  cpp_hashnode *ident = token->val.node.node;
+  printf ("Normalized: %.*s\n",
+          (int) ident->ident.len, (const char *) ident->ident.str);
+}
+#+end_src
+
+** 3.4 Example: Skipping Comments
+`_cpp_skip_block_comment` advances past a block comment and returns true if the comment is unterminated (`skip_line_comment` is its line-comment counterpart, static within `lex.cc`):
+
+#+begin_src c
+if (_cpp_skip_block_comment (pfile))
+  cpp_error (pfile, CPP_DL_ERROR, "unterminated block comment");
+#+end_src
+
+** 3.5 Buffer Usage Examples
+
+*** 3.5.1 Allocate and Fill a Temporary Buffer
+Use `_cpp_get_buff` to allocate a scratch buffer of at least a given size. Check the remaining room (`BUFF_ROOM`, i.e. `limit - cur`) before writing, and release the buffer when done.
+
+#+begin_src c
+size_t len = 5; // number of bytes to write
+_cpp_buff *buff = _cpp_get_buff (pfile, len);
+
+// ensure the buffer has enough room
+if (BUFF_ROOM (buff) < len)
+  _cpp_extend_buff (pfile, &buff, len);
+
+// write data safely
+memcpy (buff->cur, "hello", len);
+buff->cur += len;
+
+printf ("Buffer contents: %.*s\n", (int) len, buff->base);
+_cpp_release_buff (pfile, buff);
+#+end_src
+
+*** 3.5.2 Extend a Buffer Dynamically
+Extend a buffer when a write would exceed its current size; `_cpp_extend_buff` reallocates, preserving the bytes already written.
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 64);
+
+// simulate a long write
+for (int i = 0; i < 300; ++i) {
+  if (BUFF_ROOM (buff) < 1)
+    _cpp_extend_buff (pfile, &buff, 1);
+  *buff->cur++ = 'A';
+}
+
+printf ("Expanded buffer: %.10s\n", buff->base); // first 10 chars
+#+end_src
+
+*** 3.5.3 Use Buffers in Token Construction
+Construct a macro expansion or synthetic token string. `stpcpy` returns a pointer to the terminating NUL, so successive calls append.
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 32);
+buff->cur = (unsigned char *) stpcpy ((char *) buff->cur, "MY_MACRO(");
+buff->cur = (unsigned char *) stpcpy ((char *) buff->cur, "123 + 456");
+*buff->cur++ = ')';
+*buff->cur = '\0';
+
+printf ("Token string: %s\n", buff->base);
+#+end_src
+
+*** 3.5.4 Releasing a Buffer
+After using a buffer temporarily (e.g., for lookahead), release it back to the pool:
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 256);
+// ... use the buffer ...
+_cpp_release_buff (pfile, buff);
+#+end_src
+
+*** 3.5.5 Commit and Reuse
+`_cpp_commit_buff` finalizes `size` bytes of the reader's active definition buffer (`pfile->a_buff`), as is done when a macro definition is complete, and returns a stable pointer to the committed data; the space following the commit remains available for subsequent allocations:
+
+#+begin_src c
+unsigned char *first = (unsigned char *) _cpp_commit_buff (pfile, len);
+// later allocations carve from the space following the committed region
+#+end_src
+
+* 4. Notes on Extension
+
+- You may add a new directive (e.g., `#assign`) by declaring it in `directives.cc` and adding handler logic in `macro.cc` or your own file; see the sketch after this list.
+- If you want to extend the lexer with a new token kind, you must:
+  - Add a new `cpp_ttype` enum value.
+  - Extend `_cpp_lex_token` or `lex_string` to recognize and classify it.
+  - Update `cpp_type2name` and the spelling functions.
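+
+As a concrete sketch of the first point: directives are declared through the `DIRECTIVE_TABLE` X-macro in `directives.cc`, each paired with a `do_*` handler. The entry and handler below are hypothetical (`do_assign` does not exist upstream), and the exact origin/flag values vary by GCC version:
+
+#+begin_src c
+// sketch only: following the DIRECTIVE_TABLE pattern in directives.cc
+#define DIRECTIVE_TABLE                                   \
+  D(define,  T_DEFINE,  KANDR,     IN_I)                  \
+  D(assign,  T_ASSIGN,  EXTENSION, 0)  /* new entry */    \
+  /* ... remaining upstream entries ... */
+
+static void
+do_assign (cpp_reader *pfile)
+{
+  // read the identifier being assigned, then collect the value tokens
+  cpp_hashnode *name = lex_macro_node (pfile, true);
+  if (name)
+    {
+      /* ... evaluate the right-hand side and record the binding ... */
+    }
+}
+#+end_src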
+
+* 5. Recommended Reading
+- `libcpp/include/cpp-id-data.h`: For macro flags and token identifiers
+- `libcpp/lex.cc`: Lexer core implementation
+- `libcpp/directives.cc`: Directive parsing
+- `libcpp/macro.cc`: Macro expansion
+- `libcpp/line-map.cc`: Location tracking and diagnostics
+
diff --git "a/document\360\237\226\211/tool_chain_dependency_layers.org" "b/document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org"
similarity index 100%
rename from "document\360\237\226\211/tool_chain_dependency_layers.org"
rename to "document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org"
diff --git "a/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc" "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"
new file mode 100644
index 0000000..21cd614
--- /dev/null
+++ "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"
@@ -0,0 +1,48 @@
+bool
+cpp_get_paren_balanced_chars(cpp_reader *pfile)
+{
+  const unsigned char *cur = pfile->buffer->cur;
+  const unsigned char *rlimit = pfile->buffer->rlimit;
+  cpp_buffer *buffer = pfile->buffer;
+
+  // skip leading whitespace
+  while (cur < rlimit && ISSPACE(*cur))
+    cur++;
+
+  if (cur == rlimit || *cur != '(')
+    {
+      cpp_error(pfile, CPP_DL_ERROR, "expected opening parenthesis for macro body");
+      return false;
+    }
+
+  int depth = 0;
+  const unsigned char *scan = cur;
+  while (scan < rlimit)
+    {
+      if (*scan == '(')
+        depth++;
+      else if (*scan == ')')
+        {
+          depth--;
+          if (depth == 0)
+            {
+              // copy from opening ( to matching ) inclusive
+              size_t len = scan - cur + 1;
+              unsigned char *copy = (unsigned char *)_cpp_unaligned_alloc(pfile, len + 1);
+              memcpy(copy, cur, len);
+              copy[len] = '\0';
+
+              // point the lexer buffer at just this region
+              buffer->cur = copy;
+              buffer->rlimit = copy + len;
+              buffer->next_line = NULL; // signals EOF to the lexer
+              return true;
+            }
+        }
+      scan++;
+    }
+
+  // if we got here, the closing paren was never found
+  cpp_error(pfile, CPP_DL_ERROR, "unclosed parenthesis in macro body");
+  return false;
+}
diff --git "a/script_gcc_min-12\360\237\226\211/macro.cc" "b/script_gcc_min-12\360\237\226\211/macro.cc"
index 82d8b4b..d491054 100644
--- "a/script_gcc_min-12\360\237\226\211/macro.cc"
+++ "b/script_gcc_min-12\360\237\226\211/macro.cc"
@@ -4138,7 +4138,7 @@ cpp_macro_definition (cpp_reader *pfile, cpp_hashnode *node,
 // see directives.cc
 extern const char *cpp_token_as_text(const cpp_token *token);
 
-// a helper function for probing where we are at in the parse
+// a helper function for probing where the parser thinks it is in the source
 void
 debug_peek_token (cpp_reader *pfile)
 {
@@ -4156,22 +4156,48 @@ debug_peek_token (cpp_reader *pfile)
   _cpp_backup_tokens(pfile, 1);
 }
 
+// collects the body of a #define or related directive
 static bool
-collect_macro_body_tokens (cpp_reader *pfile,
-                           cpp_macro *macro,
-                           unsigned int *num_extra_tokens_out,
-                           const char *paste_op_error_msg)
-{
+collect_body_tokens(
+  cpp_reader *pfile
+  ,cpp_macro *macro
+  ,unsigned int *num_extra_tokens_out
+  ,const char *paste_op_error_msg
+  ,bool paren_matching
+){
   bool following_paste_op = false;
   unsigned int num_extra_tokens = 0;
+  int paren_depth;
+  cpp_token *token;
+
+  if(paren_matching){
+    token = _cpp_lex_direct(pfile);
+    if(token->type != CPP_OPEN_PAREN){
+      cpp_error_with_line(
+        pfile
+        ,CPP_DL_ERROR
+        ,token->src_loc
+        ,0
+        ,"expected body delimiter '(', but found: %s"
+        ,cpp_token_as_text(token)
+      );
+      fprintf(stderr, "exiting collect_body_tokens, did not find opening paren\n");
+      return false;
+    }
+    paren_depth = 1;
+    fprintf(stderr, "entry paren_depth: %d\n", paren_depth);
+  }
 
   for (vaopt_state vaopt_tracker (pfile, macro->variadic, NULL);; )
     {
-      cpp_token *token = NULL;
-
+      // lex_expansion_token() appends the next token at
+      // macro->exp.tokens[macro->count]; take a pointer to it and
+      // advance the count
       macro = lex_expansion_token(pfile, macro);
       token = &macro->exp.tokens[macro->count++];
+      fprintf(stderr, "top of loop, read token %s\n", cpp_token_as_text(token));
 
+      // recognize macro args, give them type CPP_MACRO_ARG
       if (macro->count > 1 && token[-1].type == CPP_HASH && macro->fun_like)
 	{
 	  if (token->type == CPP_MACRO_ARG
@@ -4193,27 +4219,57 @@ collect_macro_body_tokens (cpp_reader *pfile,
 	    {
 	      cpp_error(pfile, CPP_DL_ERROR,
 			"'#' is not followed by a macro parameter");
+	      fprintf(stderr, "exiting collect_body_tokens, not a macro arg and language is not ASM\n");
 	      return false;
 	    }
 	}
 
-      if (token->type == CPP_EOF)
-	{
-	  if (following_paste_op)
-	    {
-	      cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
-	      return false;
-	    }
-	  if (!vaopt_tracker.completed())
-	    return false;
-	  break;
-	}
+      // parentheses matching overhead
+      if(paren_matching){
+	if(token->type == CPP_OPEN_PAREN || token->type == CPP_CLOSE_PAREN){
+	  if(token->type == CPP_OPEN_PAREN) paren_depth++;
+	  if(token->type == CPP_CLOSE_PAREN) paren_depth--;
+	  fprintf(stderr, "new paren_depth: %d\n", paren_depth);
+	}
+
+	if(token->type == CPP_EOF){
+	  fprintf(stderr, "found CPP_EOF at paren depth %d\n", paren_depth);
+	  macro->count--;
+	  if(!_cpp_get_fresh_line(pfile)){
+	    fprintf(stderr, "exiting collect_body_tokens, _cpp_get_fresh_line failed\n");
+	    return false;
+	  }
+	  fprintf(stderr, "found CPP_EOF at depth %d, read new line, now continuing loop\n", paren_depth);
+	  continue;
+	}
+      }
+
+      // exit the loop at the end of the macro body
+      if(
+	(paren_matching && paren_depth == 0)
+	|| (!paren_matching && token->type == CPP_EOF)
+      ){
+	fprintf(stderr, "exiting macro body collect loop\n");
+	if(following_paste_op){
+	  cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
+	  fprintf(stderr, "exiting collect_body_tokens due to following_paste_op\n");
+	  return false;
+	}
+	if(!vaopt_tracker.completed()){
+	  fprintf(stderr, "exiting collect_body_tokens due to !vaopt_tracker.completed()\n");
+	  return false;
+	}
+	*num_extra_tokens_out = num_extra_tokens;
+	macro->count--; // drop the terminator
+	return true;
+      }
 
       if (token->type == CPP_PASTE)
 	{
 	  if (macro->count == 1)
 	    {
 	      cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
+	      fprintf(stderr, "exiting collect_body_tokens, paste event\n");
 	      return false;
 	    }
 
@@ -4233,18 +4289,18 @@
 	  following_paste_op = true;
 	}
-      else
+      else{
 	following_paste_op = false;
+      }
 
-      if (vaopt_tracker.update(token) == vaopt_state::ERROR)
+      if (vaopt_tracker.update(token) == vaopt_state::ERROR){
+	fprintf(stderr, "exiting collect_body_tokens due to vaopt_tracker.update(token) == vaopt_state::ERROR\n");
 	return false;
+      }
     }
-
-  *num_extra_tokens_out = num_extra_tokens;
-  return true;
 }
 
 //--------------------------------------------------------------------------------
 // for `#macro` directive
 /*
@@ -4271,6 +4327,9 @@ create_iso_macro (cpp_reader *pfile)
   bool ok = false;
   cpp_macro *macro = NULL;
 
+  int saved_in_directive = pfile->state.in_directive;
+  int saved_keep_tokens = pfile->keep_tokens;
+
   /*
   -Saves token allocation address held in pfile->cur_token.
   -Gives a new token allocation address to pfile->cur_token, that of cpp_token first.
@@ -4330,16 +4389,35 @@ create_iso_macro (cpp_reader *pfile)
   macro->parm.params = params;
   macro->fun_like = true;
 
-  // collects from pfile the tokens that constitute the macro body
-  if (!collect_macro_body_tokens(pfile, macro, &num_extra_tokens, paste_op_error_msg))
-    goto out;
+  /*
+    Collect the macro body tokens.
+    A #macro () body is delimited by parentheses.
+  */
+  pfile->state.in_directive = 0; // allow fresh lines
+  pfile->keep_tokens = 1;
+
+  // collects the remaining body tokens
+  if(
+    !collect_body_tokens(
+      pfile
+      ,macro
+      ,&num_extra_tokens
+      ,paste_op_error_msg
+      ,true
+    )
+  ) goto out;
+
+  pfile->keep_tokens = saved_keep_tokens;
+  pfile->state.in_directive = saved_in_directive; // restore
+
+  // Mark the macro as made even though it has not been committed yet; this
+  // mirrors the upstream code, which simply discards the macro if the body
+  // fails to parse.
   ok = true;
 
-  /* Don't count the CPP_EOF. */
-  macro->count--;
-
   // commit the cpp struct to memory
   // the struct reserves space for one token, the others run off the end
   macro = (cpp_macro *)_cpp_commit_buff(
-- 
2.20.1