From: Thomas Walker Lynch Date: Sat, 10 May 2025 11:41:39 +0000 (-0700) Subject: working #macro directive, albeit still with embedded test messages X-Git-Url: https://git.reasoningtechnology.com/usr/lib/python2.7/encodings/unicode_internal.py?a=commitdiff_plain;h=ff4c5f0e9de538687d9ea6f3f7c9289183652894;p=RT-gcc working #macro directive, albeit still with embedded test messages --- diff --git "a/document\360\237\226\211/how_it_works/cpp.org" "b/document\360\237\226\211/how_it_works/cpp.org" new file mode 100644 index 0000000..b38c7ba --- /dev/null +++ "b/document\360\237\226\211/how_it_works/cpp.org" @@ -0,0 +1,503 @@ +#+TITLE: C Preprocessor Overview +#+AUTHOR: Thomas Walker Lynch & Caelestis Index +#+DESCRIPTION: High-level architectural partitioning of cpp (GCC 12.x) +#+FILETAGS: cpp preprocessor architecture gcc +#+OPTIONS: toc:nil + +* Preprocessing Pipeline (Diagram) + +#+BEGIN_SRC text + C Preprocessor (cpp) + ===================== + ++----------------------+ +| Source Code | ++----------------------+ + | + v ++----------------------+ +| Lexical Analysis | <- Part of: Lexical Analysis +| (tokenize input) | ++----------------------+ + | + v ++----------------------+ +| Directive Engine | <- Part of: Directive Handling +| (#define, #if, etc.) | ++----------------------+ + | + v ++----------------------+ +| Conditional Logic | <- Part of: Conditional Compilation +| (#if/#ifdef/#else) | ++----------------------+ + | + v ++----------------------+ +| Macro Expansion | <- Part of: Macro Expansion +| (object/function) | ++----------------------+ + | + v ++----------------------+ +| Callback Hooks | <- Part of: Hook and Callback Interface +| (cpp_callbacks) | ++----------------------+ + | + v ++----------------------+ +| Output Tokens | <- Output stream to compiler frontend +| (to GCC parser) | ++----------------------+ +#+END_SRC + +Each block corresponds to a major processing stage in `cpp`. 
The functional groups defined earlier align to these blocks as indicated, though some (like state management and diagnostics) operate globally across the pipeline. + + + +* Major Functional Partitions of the C Preprocessor (cpp) + +This section outlines the primary architectural components of the C preprocessor as implemented in GCC 12.x. These functional partitions help frame how cpp processes input and how its internal modules interact. + +** 1. Lexical Analysis +- Tokenizes input into =cpp_token= streams. +- Decodes: + - UTF-8 characters + - Trigraphs (e.g., =??== for =#=) + - Digraphs (e.g., =<:= for =[=) +- Central structure: =cpp_lexer= +- Produces tokens for macro expansion and conditional evaluation. +** 2. Directive Handling +- Processes all =#= directives, including: + - =#define=, =#undef=, =#include=, =#line=, =#error=, =#pragma= + - Extended directives like =#assign=, =#call= if supported. +- Managed via =directive_table= and dispatch functions like =do_define=, =do_include=, etc. + +** 3. Conditional Compilation +- Handles constructs like: + - =#if=, =#ifdef=, =#ifndef=, =#elif=, =#else=, =#endif= +- Used to include or exclude code based on macro definitions and constant expressions. +- Driven by the =if_stack= in =cpp_reader=. +- Central to controlling variant builds, platform-specific code, or staged compilation. +** 4. File Inclusion and Search Paths +- Resolves =#include= and maintains include history. +- Handles: + - System vs user includes (<...> vs "..."). + - Include path resolution via =cpp_search_path=. + - File change tracking via =file_stack=. +** 5. Macro Expansion +- Handles object-like and function-like macros: + - =#define PI 3.14= + - =#define SQR(x) ((x)*(x))= +- Manages: + - Argument collection and expansion + - Token-pasting (=##=) and stringification (=#=) +- Involves =macro_table=, =collect_args=, and =expand_macro()= + +** 6. Diagnostics and Error Recovery +- Reports syntax errors, macro misuse, directive misuse. 
+- Uses: + - =cpp_error=, =cpp_warning=, =cpp_notice= + - Tracks macro nesting, input location, and file state for context. + +** 7. Hook and Callback Interface +- Interface: =cpp_callbacks= +- Allows frontend or plugin to observe: + - Macro definitions + - File changes + - Token output stream +- Enables debugging tools, IDEs, or language servers to integrate preprocessor awareness. + +** 8. State Management and Scoping +- Maintains global and file-level preprocessor state. +- Tracks: + - Nested conditional state via =if_stack= + - Macro table lifetimes and shadowing + - Include guards and =#pragma once= heuristics + + +* cpplib.h -- Application Interface Overview + +This section documents the **interface** and **in-memory model** of the C preprocessor (`libcpp`) from GCC 12.2.0. +It covers core data structures (tokens, macros, readers) and the primary functions for working with them. + +** Key Data Structures + +*** Token & Token Metadata +- `enum cpp_ttype` :: All possible token types (operators, names, literals, etc.) +- `struct cpp_token` :: Represents a token in the stream (with union-based payload) +- `enum cpp_token_fld_kind` :: Discriminates the active field in `cpp_token.val` +- `struct cpp_string` :: Raw string representation with length and pointer + +*** Macros & Identifiers +- `struct cpp_macro` :: Describes macro kind, parameter list, and token expansion +- `enum cpp_macro_kind` :: ISO-style, traditional-style, and assertion macros +- `struct cpp_identifier` :: Canonical and original spellings of a name +- `struct cpp_macro_arg` :: Argument number and spelling for macro arguments + +*** Symbol Table +- `struct cpp_hashnode` :: Hash table node for identifiers/macros +- `enum node_type` :: Distinguishes macro types (arg/user/builtin) +- `union _cpp_hashnode_value` :: Payload (macro, arg index, etc.) 
+- `enum cpp_builtin_type` :: Reserved built-ins like `__LINE__`, `__FILE__`, `_Pragma` + +*** Reader & Configuration +- `struct cpp_reader` :: Forward-declared. Central structure for preprocessing. +- `struct cpp_options` :: Stores all language mode flags, warning flags, and feature toggles. +- `struct cpp_callbacks` :: Client hook interface for diagnostic, macro, and file events. +- `struct cpp_dir` :: Represents an `#include` search directory. + +*** Numerics +- `struct cpp_num` :: Two-part 64-bit integer (high, low), overflow flags +- `cpp_classify_number` :: Categorizes radix/type (e.g., `0x`, `u`, `LL`) +- Defines :: `CPP_N_*` classify bits (INTEGER, FLOATING, WIDTH, RADIX, SUFFIX) + +*** Charset Handling +- `typedef cppchar_t` :: 32-bit safe character representation +- `struct cpp_decoded_char` :: Result of UTF-8 decoding step +- `struct cpp_char_column_policy` :: Visual column handling for diagnostics +- `class cpp_display_width_computation` :: Converts UTF-8 sequence to visual width + +*** Comment Tracking +- `struct cpp_comment`, `cpp_comment_table` :: Captures all parsed comments (if enabled) + +** Core Functions + +*** Lifecycle & Reader Setup +- `cpp_create_reader(enum c_lang, ...)` :: Allocates and initializes `cpp_reader` +- `cpp_finish`, `cpp_destroy` :: Finalize and free the reader +- `cpp_post_options` :: Commit option changes after parsing flags + +*** Preprocessing Input +- `cpp_read_main_file` :: Begin reading and preprocessing a source file +- `cpp_get_token()` :: Fetch next token from stream +- `cpp_peek_token()` :: Peek ahead without consuming +- `cpp_backup_tokens()` :: Push tokens back for re-parsing +- `cpp_retrofit_as_include()` :: Treat main file as if included + +*** Macro System +- `cpp_define()`, `cpp_define_unused()`, `cpp_define_lazily()` :: Define macros +- `cpp_macro_definition()` :: Dump macro body as string +- `cpp_compare_macros()` :: Deep compare two macros +- `cpp_undef()`, `cpp_undef_all()` :: Remove macro(s) +- 
`cpp_set_deferred_macro()`, `cpp_get_deferred_macro()` :: Lazy macro substitution + +*** Symbol Lookup +- `cpp_lookup()` :: Lookup or create an identifier hashnode +- `cpp_forall_identifiers()` :: Iterate over all identifiers + +*** String & Char Evaluation +- `cpp_interpret_charconst()` :: Parse a character constant (e.g. `'a'`) +- `cpp_interpret_string()` :: Parse string literal(s) into `cpp_string` +- `cpp_interpret_integer()` :: Parse numeric token into `cpp_num` + +*** Diagnostics +- `cpp_error()`, `cpp_warning()`, `cpp_pedwarning()` :: General messages +- `cpp_error_at()` :: Message with source location (rich_location optional) +- `cpp_errno()` / `cpp_errno_filename()` :: Errors based on `errno` +- `cpp_warning_with_line()` :: Fallback location-based warnings +- `cpp_get_callbacks()` / `cpp_set_callbacks()` :: Manage diagnostic hooks + +*** Extension Hooks & Pragma +- `cpp_register_pragma()` :: Register custom `#pragma` handler +- `cpp_get_callbacks()` :: Access to client-supplied hook table +- `cpp_define_formatted()` :: Macro with `printf`-style input +- `cpp_directive_only_process()` :: Run directive-only logic on a token stream + +*** Includes & File Management +- `cpp_set_include_chains()` :: Set system and user include paths +- `cpp_push_buffer()` :: Manually push a buffer for parsing +- `cpp_included()`, `cpp_included_before()` :: Has this file been included? +- `cpp_get_converted_source()` :: Read a file in input charset, return decoded buffer + +** Token Types (cpp_ttype) + +A full enumeration of all tokens in the preprocessor: +- Operators: `CPP_PLUS`, `CPP_MINUS`, `CPP_EQ_EQ`, etc. +- Punctuation: `CPP_OPEN_PAREN`, `CPP_HASH`, `CPP_SEMICOLON` +- Literals: `CPP_STRING`, `CPP_WCHAR`, `CPP_NUMBER` +- Special: `CPP_MACRO_ARG`, `CPP_PRAGMA`, `CPP_EOF` + +Each token has: +- Type (`enum cpp_ttype`) +- Flags (`PREV_WHITE`, `DIGRAPH`, `NO_EXPAND`, etc.) 
+- Source location +- Union payload (e.g., string, macro arg, hashnode) + +** Interface Concepts Beyond Code +*** Unicode Handling +- Input is normalized per `cpp_normalize_level` +- UTF-8 is expanded into 32-bit code points (`cppchar_t`) +- Display width of characters is estimated for diagnostics +- Bidi (bidirectional) controls are optionally scanned/warned + +*** Client Extension Hooks +- Most preprocessing operations (macro use, `#include`, comments, errors) are callback-hooked +- Used by GCC frontend to track macro use, implement diagnostics, and guide `#pragma` processing + +*** Dependency Generation +- `cpp_finish()` accepts an output stream for dependency info +- Options control whether main file is included, phony targets are added, etc. + +** Summary + +`cpplib.h` serves as both API contract and internal representation guide. +- It offers a high-fidelity view of source tokens for later compiler stages. +- The entire macro system, character encoding, and diagnostic lifecycle are managed through this interface. + + + + +* Callback Hooks (cpp_callbacks) + +The `cpp_callbacks` struct in `cpplib.h` allows external consumers (e.g., GCC frontend, IDE integrations, or plugins) to receive notifications during preprocessing. Each function pointer in this struct represents a hookable event. 
+ +** Overview + +Hooks are triggered at specific stages: +- After macro definition or undefinition +- Before and after file inclusion +- When tokens are emitted +- Upon encountering diagnostics +- During comment scanning (if enabled) +- On encountering special directives (e.g., `#pragma`) + +** Hook Structure + +#+BEGIN_SRC c +struct cpp_callbacks { + void (*define)(cpp_reader *, source_location, const cpp_hashnode *); + void (*undef)(cpp_reader *, source_location, const cpp_hashnode *); + void (*include)(cpp_reader *, const char *filename, int angle_brackets); + void (*file_change)(cpp_reader *, const struct line_map *); + void (*line_change)(cpp_reader *, source_location, int to_file, int to_line); + void (*ident)(cpp_reader *, const cpp_string *); + void (*invalid_directive)(cpp_reader *); + void (*def_pragma)(cpp_reader *, const cpp_token *); + void (*cb_comment)(cpp_reader *, const cpp_token *); +}; +#+END_SRC + +Each callback receives either a pointer to the `cpp_reader`, the affected token or structure, and optional contextual data. + +--- + +** `define` + +*** Trigger +- Fired immediately after a macro is defined with `#define`. + +*** Parameters +- `cpp_reader *pfile`: global preprocessor state (read-write). +- `source_location loc`: location of the `#define`. +- `const cpp_hashnode *node`: the macro name and metadata (read-only in this context). + +*** Semantics +- The `cpp_hashnode` holds the macro's name and a pointer to its `cpp_macro` definition. +- Modifying the macro at this point is possible but discouraged. Use `cpp_undef()` + `cpp_define()` instead if redefinition is needed. + +*** Uses +- GCC uses this to update dependency tracking and debug tables. +- Tools may track macro definitions, emit logs, or enforce naming policies. + +--- + +** `undef` + +*** Trigger +- Fired after `#undef` removes a macro. + +*** Parameters +- Same as `define`. + +*** Semantics +- The node is marked `undefined`, but the symbol remains in the hash table. 
+- No mutation should occur—only inspection or logging. + +*** Uses +- Enables reversal tracking or macro scoping analysis. + +--- + +** `include` + +*** Trigger +- Fired just before a file is opened via `#include`. + +*** Parameters +- `cpp_reader *pfile` +- `const char *filename`: string from the include directive (not normalized). +- `int angle_brackets`: nonzero for `<...>`, zero for `"..."`. + +*** Semantics +- Purely informational; does not affect include search or suppression. +- The filename is unverified and not guaranteed to exist. + +*** Uses +- IDEs and build tools use this to build include graphs. +- LSPs use it to track file references and symbol origins. + +--- + +** `file_change` + +*** Trigger +- Called when the active input file changes (entry or exit of `#include`). + +*** Parameters +- `cpp_reader *pfile` +- `const struct line_map *map`: describes the current file's location and context. + +*** Semantics +- `line_map` gives full access to file/line/column mapping. +- This structure is read-only; mutating it will corrupt diagnostics and tokenization. + +*** Uses +- Debug info (DWARF line tables), logging, stack-based include tracking. + +--- + +** `line_change` + +*** Trigger +- Fired on `#line` directives or line-mapping transitions. + +*** Parameters +- `cpp_reader *pfile` +- `source_location loc`: location in input stream. +- `int to_file`: non-zero if a new file name is being used. +- `int to_line`: new logical line number. + +*** Semantics +- Use this to remap locations or re-synchronize overlays. +- These values are inputs to the line map; do not write back. + +*** Uses +- Used in DWARF debug info to support accurate line-based breakpoints. + +--- + +** `ident` + +*** Trigger +- Called when a `#ident` directive is parsed. + +*** Parameters +- `cpp_reader *pfile` +- `const cpp_string *text`: payload of the identifier message. + +*** Semantics +- Informational only. Common in legacy systems or codegen traces. 
+ +*** Uses +- Collect module identity, versioning hints, or logmarks. + +--- + +** `invalid_directive` + +*** Trigger +- Fired when an unrecognized or malformed directive is encountered. + +*** Parameters +- `cpp_reader *pfile` + +*** Semantics +- Hook has no extra context; use `cpp_get_token()` to recover. +- Hook may trigger fallback behavior or custom directive logic. + +*** Uses +- Used in `-fpreprocessed` mode to suppress diagnostics. +- External tools can use this to extend the directive set. + +--- + +** `def_pragma` + +*** Trigger +- Fired when a `#pragma` directive is parsed. + +*** Parameters +- `cpp_reader *pfile` +- `const cpp_token *pragma`: token stream beginning with `CPP_PRAGMA`. + +*** Semantics +- Read-only access to token stream. +- Mutation possible via `cpp_push_buffer()` to inject expanded tokens. + +*** Uses +- GCC plugins hook this to implement custom `#pragma` behavior. +- Can trigger front-end features (like `#pragma GCC diagnostic`). + +--- + +** `cb_comment` + +*** Trigger +- Optional. Enabled if comment tracking is requested. + +*** Parameters +- `cpp_reader *pfile` +- `const cpp_token *comment`: holds text of comment. + +*** Semantics +- Only line/block comment content is captured, not semantics. +- Read-only token; do not mutate token payload. + +*** Uses +- Used by source-to-source translators and formatters. +- Some static analyzers inspect comments for hints or disables. + +--- + +** Summary + +The `cpp_callbacks` interface enables observational and limited transformational interaction with the preprocessor pipeline. + +- Most parameters are read-only or shallow copies. +- For transformations, prefer using `cpp_define()`, `cpp_push_buffer()`, or `cpp_backup_tokens()` externally. +- Internal structures like `cpp_reader`, `cpp_token`, and `cpp_macro` should not be mutated unless explicitly permitted. 
+ + + +* Plugin-Like Integration in libcpp + +Unlike the main GCC compiler, which supports a formal plugin system (`gcc-plugin.h`), `libcpp` (the C preprocessor library) does *not* support plugins in the dynamic or runtime-loaded sense. There is no system for loading shared libraries, registering handlers via symbols, or extending preprocessor behavior through runtime modules. + +** Static Hook Interface via cpp_callbacks + +Instead, `libcpp` exposes a *statically defined interface* (`struct cpp_callbacks`) for embedding applications to receive notifications of preprocessor events. These include: + +- Macro definitions and undefinitions +- Source file entry/exit +- Comment and pragma parsing +- Token emission and buffer transitions + +An embedding client (such as GCC's C/C++ frontend, or a third-party tool using libcpp) may assign function pointers directly into this struct during reader setup. + +#+BEGIN_SRC c +cpp_reader *r = cpp_create_reader(...); +cpp_callbacks *cb = cpp_get_callbacks(r); +cb->define = my_macro_handler; +cb->file_change = my_file_tracker; +#+END_SRC + +This pattern is analogous to a *plugin interface*, but all logic is statically linked at compile time. + +** Mutability and Access Scope + +The callback interface is primarily **observational**—that is, hooks are expected to inspect events, not mutate the `cpp_reader` state directly. However, advanced users can, with care, reach into the data structures passed to them (e.g., `cpp_macro`, `cpp_hashnode`) and affect behavior, though this is neither documented nor officially supported. 
+ +In summary: + +| Feature | GCC Frontend Plugin | libcpp Callback Interface | +|--------------------------+---------------------+----------------------------| +| Dynamically loadable | Yes | No | +| Runtime extension API | Yes (`gcc-plugin.h`) | No | +| Assign custom handlers | Yes | Yes (via `cpp_callbacks`) | +| Mutate core structures | With care | With care (not endorsed) | +| Stability across versions| Best-effort | Internal API, may break | + +** Recommendation + +Use `cpp_callbacks` as a read-only interface to monitor preprocessing behavior. If deeper mutation or instrumentation is required, consider modifying or forking `libcpp` itself. There is currently no officially supported way to extend it at runtime. diff --git "a/document\360\237\226\211/how_it_works/cpp_reader.org" "b/document\360\237\226\211/how_it_works/cpp_reader.org" new file mode 100644 index 0000000..bc87d15 --- /dev/null +++ "b/document\360\237\226\211/how_it_works/cpp_reader.org" @@ -0,0 +1,147 @@ +#+TITLE: cpp_reader: Preprocessor State and Interface Guide +#+AUTHOR: Caelestis Index +#+FILETAGS: cpp, GCC internals, preprocessor, architecture + +* Overview +The =cpp_reader= struct in GCC's =libcpp= encapsulates the complete state of a single C preprocessor session. It governs token input, macro expansion, directive parsing, include stack management, and source map resolution. It is the central state object passed through nearly all parts of the C preprocessor. + +* 1. State Data + +** 1.1 Buffer and Lexing State +- ~buffer~, ~overlaid_buffer~: Input buffer stack for file and macro streams. +- ~cur_token~, ~cur_run~, ~base_run~: Active token buffer and tokenrun tracking. +- ~keep_tokens~: Whether to preserve old tokens (e.g., for diagnostics). +- ~a_buff~, ~u_buff~, ~free_buffs~: Temporary memory allocation pools. + +** 1.2 Parsing and Directive State +- ~state~: General lexer state (includes ~in_directive~ flag). 
+- ~state.in_directive~: Boolean flag indicating whether the preprocessor is currently parsing a directive line. If ~true~, token behavior (e.g., whitespace and line continuation) may differ. +- ~directive~, ~directive_line~: Currently parsed directive and its location. +- ~directive_result~: Token synthesized by a directive (if any). + +** 1.3 Macro Context and Expansion +- ~context~, ~base_context~: Macro expansion call stack. +- ~top_most_macro_node~: Current top-level macro under expansion. +- ~about_to_expand_macro_p~: Indicates if a macro is about to expand. +- ~macro_buffer~, ~macro_buffer_len~: Buffers for rendering macro string forms. + +** 1.4 Include and File Lookup State +- ~quote_include~, ~bracket_include~, ~no_search_path~: Search paths. +- ~all_files~, ~main_file~: Linked list of all known input files. +- ~file_hash~, ~dir_hash~: Hashtables for file path caching. +- ~nonexistent_file_hash~: Optimizes negative lookup caching. +- ~seen_once_only~: Tracks ~#pragma once~ semantics. + +** 1.5 Character Set Conversion +- ~narrow_cset_desc~, ~utf8_cset_desc~, ~wide_cset_desc~, etc.: Converters for source to execution character encodings. + +** 1.6 Location Mapping and Source Positioning +- ~line_table~: GCC's =line_maps= structure for virtual location tracking. +- ~invocation_location~, ~main_loc~, ~forced_token_location~: Positional context for diagnostics, token creation. + +** 1.7 Miscellaneous Flags and Utilities +- ~quote_ignores_source_dir~: Include resolution behavior flag. +- ~counter~: Value of the ~__COUNTER__~ macro. +- ~out~: Output buffer for traditional preprocessing mode. +- ~savedstate~: Used for dependency tracking with precompiled headers. +- ~comments~: Optional comment capture buffer. + +* 2. Core Interface Functions +** 2.1 Token Retrieval +- ~cpp_get_token(pfile)~: Public interface for retrieving the next logical token. +- ~cpp_peek_token(pfile, N)~: Look ahead without consuming. 
+- ~cpp_get_token_1(pfile)~: Internal token fetch used during macro expansion. + +** 2.2 Macro Definition and Expansion +- ~_cpp_new_macro(pfile, cmk_macro, obstack_ptr)~: Allocate and initialize a new macro definition. +- ~_cpp_mark_macro_used(node)~: Mark a macro as having been used. +- ~replace_args(...)~: Expand and replace macro arguments (not used during directive handling). +- ~collect_args(...)~: Collects arguments for a function-like macro invocation. +- ~collect_single_argument(...)~: Parses one macro argument and handles token accumulation. +- ~cpp_arguments_ok(...)~: Checks argument count and matching for a macro invocation. +- ~set_arg_token(...)~: Sets or appends a token in an argument’s expansion list. + +** 2.3 Directive Handling Helpers +- ~_cpp_skip_rest_of_line(pfile)~: Skip trailing tokens after directive arguments. +- ~lex_macro_node(pfile)~: Specialized lexer for parsing macro names. + +** 2.4 File/Include Handling +- ~cpp_push_include(pfile, filename)~: Add a new include to the stack. +- ~cpp_find_include_file(...)~: Path search logic. + +** 2.5 Location Utilities +- ~cpp_token_location(token)~: Extracts a =location_t= from a token. +- ~linemap_add(...)~: Adds a mapping between logical and physical line/column. + +** 2.6 Miscellaneous +- ~cpp_warning_with_line(...)~, ~cpp_error_with_line(...)~: Emit diagnostics with location. +- ~cpp_lookup(pfile, name, length)~: Interns an identifier and returns a ~cpp_hashnode *~. +- ~NODE_NAME(node)~: Expands to the null-terminated name of a macro node. + +* 3. 
Usage Examples + +** 3.1 Defining a Macro from a Directive +#+BEGIN_SRC c +cpp_hashnode *node = lex_macro_node(pfile); +cpp_macro *macro = _cpp_new_macro(pfile, cmk_macro, _cpp_reserve_room(pfile, 0, sizeof(cpp_macro))); +macro->count = 1; +macro->exp.tokens[0] = make_number_token("42"); +node->type = NT_USER_MACRO; +node->value.macro = macro; +_cpp_mark_macro_used(node); +#+END_SRC + +** 3.2 Parsing a Directive With Two Arguments +#+BEGIN_SRC c +cpp_token *arg1 = cpp_get_token(pfile); +cpp_token *comma = cpp_get_token(pfile); +if (comma->type != CPP_COMMA) + cpp_error(pfile, CPP_DL_ERROR, "expected ',' after macro name"); +cpp_token *arg2 = cpp_get_token(pfile); +_cpp_skip_rest_of_line(pfile); +#+END_SRC + +** 3.3 Controlling Directive Context +#+BEGIN_SRC c +bool saved = pfile->state.in_directive; +pfile->state.in_directive = false; +assign_handler(pfile); +pfile->state.in_directive = saved; +#+END_SRC + +** 3.4 Tokenization and Location Debugging +#+BEGIN_SRC c +const cpp_token *tok = cpp_get_token(pfile); +location_t loc = tok->src_loc; +printf("token at line: %d\n", LOCATION_LINE(loc)); +#+END_SRC + +* 4. directives.cc extensions to the reader +- ~lex_macro_node(pfile)~: Returns a ~cpp_hashnode *~ for the next identifier, used for directives like ~#define~ or custom ones like ~#assign~. +- ~_cpp_skip_rest_of_line(pfile)~: Advances the token stream to the next physical line. +- ~cpp_error_with_line(...)~, ~cpp_warning_with_line(...)~: Used for directive diagnostics. +- ~cpp_lookup(pfile, name, length)~: Interns a name as a hashnode symbol. +- ~cpp_reader->directive_result~: Used to push a synthesized token result into the stream (e.g., for ~#include_next~). +- ~pfile->state.in_directive~: Must be manually toggled when directive code calls into macro infrastructure. +* 5. macro.cc extensions to the reader + +*** 5.1 collect_args(...) +Accumulates macro arguments for a function-like macro. 
Reads and segments the input stream into a series of ~macro_arg~ entries, tracking nesting of parentheses and token boundaries. + +*** 5.2 collect_single_argument(...) +Parses and collects one macro argument, terminating on a comma or closing paren. Used internally by ~collect_args~, but can be called separately for single-argument macro handling. + +*** 5.3 replace_args(...) +Performs full substitution of macro arguments into the macro body. Handles token pasting (~##~), stringification (~#~), and recursive macro expansion. + +*** 5.4 cpp_arguments_ok(...) +Checks whether the number of provided arguments matches the macro’s parameter list. Validates ~paramc~ and variadic status. + +*** 5.5 set_arg_token(...) +Helper to insert or append a token into a ~macro_arg~. Used when building argument streams in ~collect_single_argument~. + +These routines enable fine-grained control over macro behavior and can be selectively reused to simulate macro expansion at directive time (e.g., ~#assign~, ~#bind~, or macro templating extensions). +* 6. Conclusion +~cpp_reader~ is the heart of the preprocessor, acting as a unifying context for token streams, macro tables, buffer management, diagnostics, and parser state. Understanding and safely manipulating it is key to extending the preprocessor (e.g., adding new directives like ~#assign~) without destabilizing expansion or include logic. + +Use ~in_directive~, ~context~, and ~cur_token~ fields with care, and follow the established patterns in ~directives.cc~ and ~macro.cc~ to ensure consistent behavior across parse and expansion phases. 
diff --git "a/document\360\237\226\211/how_it_works/lexing.org" "b/document\360\237\226\211/how_it_works/lexing.org" new file mode 100644 index 0000000..a6bed25 --- /dev/null +++ "b/document\360\237\226\211/how_it_works/lexing.org" @@ -0,0 +1,230 @@ +#+TITLE: GCC libcpp Lexer: Structure, Usage, and Extension +#+AUTHOR: Caelus (OpenAI) and Thomas Walker Lynch +#+DATE: 2025-05-09 + +* Overview +The C preprocessor lexer (`lex.cc`) in GCC's `libcpp` is responsible for scanning raw source characters and emitting `cpp_token` structures. It is Unicode-aware, macro-sensitive, context-tracking, and supports multiple levels of token buffering. This lexer is both a general-purpose lexical analyzer and a specialized component for preprocessing. + +This document provides: +1. An architectural overview of how the lexer operates. +2. Guidance on how to interface with it (i.e., how to invoke, initialize, and consume it). +3. Examples demonstrating token flow and useful idioms. + +* 1. About the Lexer + +** 1.1 Services Provided +The lexer transforms a stream of characters into a stream of `cpp_token`s. It performs: +- UCN (Universal Character Name) expansion. +- Unicode normalization for identifiers. +- Detection of digraphs/trigraphs. +- Skipping of whitespace and comments. +- Classification into token types (`cpp_ttype`). +- Optional macro expansion (via higher-level coordination with macro subsystem). + +The function `_cpp_lex_token()` is the main entry point for lexing one token from the input stream. + +** 1.2 Token Types and Structures +Tokens are represented as `struct cpp_token`, which contains: +- `type`: token kind (from `cpp_ttype`) +- `val`: a union holding the value (e.g. 
number, string, identifier) +- `flags`: indicators such as `PREV_WHITE` or `DIGRAPH` +- `src_loc`: location for diagnostics +- `spelling`: optional cached spelling (may be recomputed) + +Auxiliary structures include: +- `cpp_hashnode`: interned identifiers and macro names +- `normalize_state`: for handling normalization and BiDi context +- `_cpp_buff`: dynamic buffers used for temporary token storage + +** 1.3 Unicode and Normalization +Lexer supports bidirectional Unicode enforcement using: +- `context`, `normalize_state`: track BiDi embeddings and UCN states +- `on_char`, `on_close`, `maybe_warn_bidi_on_close`: enforce structure + +** 1.4 Vectorized Fast Path +Several functions (e.g. `search_line_sse2`) accelerate scanning on x86 via SIMD. These are conditionally invoked from `search_line_fast` when alignment and CPU features allow. + +** 1.5 Token Buffers and Pools +Token buffers are managed using `_cpp_get_buff`, `_cpp_extend_buff`, `_cpp_commit_buff`, and `_cpp_release_buff`. These form a scratch/reuse pool and reduce allocations in macro processing or lexing multiple tokens rapidly. + +* 2. 
How to Use the Lexer API + +** 2.1 Initialization +Before lexing, the preprocessor must initialize its state: + +#+begin_src c +cpp_reader *pfile = cpp_create_reader(CLK_GNUC17, NULL, NULL); +_cpp_init_lexer(pfile); +_cpp_init_tokenrun(pfile); +#+end_src + +** 2.2 Lexing Tokens +To retrieve the next token: + +#+begin_src c +const cpp_token *token = _cpp_lex_token(pfile); +#+end_src + +For directive-specific parsing (no macro expansion): + +#+begin_src c +cpp_token *token = _cpp_lex_direct(pfile); +#+end_src + +** 2.3 Token Inspection +Each token has type and value fields: + +#+begin_src c +if (token->type == CPP_NUMBER) { + printf("Numeric token: %s\n", cpp_spell_token(pfile, token)); +} +#+end_src + +** 2.4 Identifier Handling +Lex identifiers directly (e.g., for macro lookup): + +#+begin_src c +cpp_hashnode *node = _cpp_lex_identifier(pfile); +if (cpp_macro_p(node)) { + // Node is a macro +} +#+end_src + +** 2.5 Stringification and Output +To spell a token or output lines: + +#+begin_src c +unsigned char *text = cpp_token_as_text(pfile, token); +cpp_output_token(pfile, token, stdout); +#+end_src + +* 3. 
Examples and Advanced Use + +** 3.1 Simple Token Stream +Lex a stream from input and print token types: + +#+begin_src c +while (true) { + const cpp_token *tok = _cpp_lex_token(pfile); + if (tok->type == CPP_EOF) + break; + printf("Token: %s\n", cpp_type2name(tok->type)); +} +#+end_src + +** 3.2 Peeking and Lookahead +Use `cpp_peek_token` to look ahead: + +#+begin_src c +const cpp_token *next = cpp_peek_token(pfile); +if (next->type == CPP_OPEN_PAREN) + printf("Function call?\n"); +#+end_src + +** 3.3 Handling Unicode Identifiers +To support identifiers with UCNs: + +#+begin_src c +cpp_hashnode *ident = _cpp_lex_identifier(pfile); +const uchar *spell = _cpp_spell_ident_ucns(pfile, ident); +printf("Normalized: %s\n", spell); +#+end_src + +** 3.4 Example: Skipping Comments +Use `_cpp_skip_block_comment` or `skip_line_comment`: + +#+begin_src c +bool changed_line = _cpp_skip_block_comment(pfile); +if (changed_line) + _cpp_clean_line(pfile); +#+end_src + +** 3.5 Buffer Usage Examples + +*** 3.5.1 Allocate and Fill a Temporary Buffer +Use `_cpp_get_buff` to allocate a scratch buffer. Always check and ensure space before writing. Then commit the buffer and retrieve its contents. + +#+begin_src c +_cpp_buff *buff = _cpp_get_buff(pfile); +size_t len = 5; // Number of bytes to write + +// Ensure buffer has enough room +if ((size_t)(buff->limit - buff->cur) < len) + _cpp_extend_buff(pfile, &buff); + +// Write data safely +memcpy(buff->cur, "hello", len); +buff->cur += len; + +// Commit buffer and retrieve stable pointer +unsigned char *data = (unsigned char *) _cpp_commit_buff(pfile, buff, len); +printf("Buffer contents: %.*s\n", (int)len, data); +#+end_src +*** 3.5.2 Extend a Buffer Dynamically +Extend a buffer when you exceed its original size. 
+ +#+begin_src c +_cpp_buff *buff = _cpp_get_buff(pfile); + +// Simulate a long write +for (int i = 0; i < 300; ++i) { + if ((size_t)(buff->limit - buff->cur) < 1) { + _cpp_extend_buff(pfile, &buff); + } + *buff->cur++ = 'A'; +} + +unsigned char *text = (unsigned char *) _cpp_commit_buff(pfile, buff, 300); +printf("Expanded buffer: %.*s\n", 10, text); // First 10 chars +#+end_src + +*** 3.5.3 Use Buffers in Token Construction +Construct a macro expansion or synthetic token string. + +#+begin_src c +_cpp_buff *buff = _cpp_get_buff(pfile); +buff->cur = stpcpy((char *)buff->cur, "MY_MACRO("); +buff->cur = stpcpy((char *)buff->cur, "123 + 456"); +*buff->cur++ = ')'; + +unsigned char *macro_text = (unsigned char *) _cpp_commit_buff(pfile, buff, + buff->cur - buff->base); +printf("Token string: %s\n", macro_text); +#+end_src + +*** 3.5.4 Releasing a Buffer +After using a buffer temporarily (e.g., in lookahead), release it. + +#+begin_src c +_cpp_buff *buff = _cpp_get_buff(pfile); +// ... use the buffer ... +_cpp_release_buff(pfile, buff); +#+end_src + +*** 3.5.5 Commit and Reuse +After committing a buffer, you may allocate another for reuse: + +#+begin_src c +unsigned char *first = (unsigned char *) _cpp_commit_buff(pfile, buff, len); +_cpp_buff *next = _cpp_get_buff(pfile); +// next->base points to fresh or recycled memory +#+end_src + +* 4. Notes on Extension + +- You may insert a new directive (e.g., `#assign`) by defining it in `directives.cc` and adding handler logic in `macro.cc` or your own file. +- If you want to extend the lexer for new token kinds, you must: + - Add a new `cpp_ttype` enum value. + - Extend `_cpp_lex_token` or `lex_string` to recognize and classify it. + - Update `cpp_type2name` and spelling functions. + +* 5. 
Recommended Reading +- `libcpp/include/cpp-id-data.h`: For macro flags and token identifiers +- `libcpp/lex.cc`: Lexer core implementation +- `libcpp/directives.cc`: Directive parsing +- `libcpp/macro.cc`: Macro expansion +- `libcpp/line-map.cc`: Location tracking and diagnostics + + + + diff --git "a/document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org" "b/document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org" new file mode 100644 index 0000000..94c7a70 --- /dev/null +++ "b/document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org" @@ -0,0 +1,78 @@ +#+TITLE: Toolchain Dependency Layers +#+AUTHOR: Thomas Walker Lynch +#+DATE: 2025-05-06 +#+OPTIONS: toc:nil num:nil +#+LANGUAGE: en + +* Purpose + +This document outlines the dependencies involved in building a standalone GCC toolchain. It compares two approaches: + +1. Using system-provided tools and headers to build GCC +2. Building a fully self-consistent standalone toolchain + +Understanding the bootstrap sequence is critical for modifying or reproducing GCC builds, especially when building in isolation. + +* The Story: Bootstrap Spiral + +So this programmer — he wanted to add a new directive to GCC. + +So he downloaded the GCC sources with the intent to make a modified, standalone copy. + +But to compile GCC, he needed standard C library headers — which meant downloading glibc. + +But to compile glibc, he needed a working C compiler. So he would first need a minimal GCC — stage 1. + +But to build that stage 1 GCC, he needed glibc headers. + +So he compiled the glibc headers first. + +Then he compiled stage 1 GCC. + +Then he compiled the full glibc. + +Then he compiled the full GCC. + +Ah, but to compile the glibc headers, he first needed the Linux kernel headers... + +There was an old lady who swallowed a fly. I don’t know why she... + +* Approach 1: System-Assisted Bootstrap + +This method uses the host system’s tools and headers to provide bootstrap support. 
It is simpler and faster, but not fully isolated. + +** Dependencies: + +- System-provided: + - C compiler (e.g. GCC) + - libc (headers and shared objects) + - binutils + - Linux kernel headers + +- Build Steps: + 1. Build binutils using system GCC + 2. Build GCC using system libc and headers + +** Characteristics: +- Fast +- Relies on host environment +- Not self-contained + +** Use Case: +- Building a local variant of GCC that will be used on the same system +- Development where purity or relocatability isn’t required + +* Approach 2: Fully Self-Consistent Toolchain + +This method builds every component of the toolchain in a clean directory, using only upstream sources. It isolates the build from host interference. + +** Dependencies: + +- Linux kernel headers (must be provided up front) +- Binutils source +- Glibc source +- GCC source + +** Build Sequence: + +1. Install Linux kernel headers → needed to build glibc diff --git "a/document\360\237\226\211/source/cpp.org" "b/document\360\237\226\211/source/cpp.org" deleted file mode 100644 index b38c7ba..0000000 --- "a/document\360\237\226\211/source/cpp.org" +++ /dev/null @@ -1,503 +0,0 @@ -#+TITLE: C Preprocessor Overview -#+AUTHOR: Thomas Walker Lynch & Caelestis Index -#+DESCRIPTION: High-level architectural partitioning of cpp (GCC 12.x) -#+FILETAGS: cpp preprocessor architecture gcc -#+OPTIONS: toc:nil - -* Preprocessing Pipeline (Diagram) - -#+BEGIN_SRC text - C Preprocessor (cpp) - ===================== - -+----------------------+ -| Source Code | -+----------------------+ - | - v -+----------------------+ -| Lexical Analysis | <- Part of: Lexical Analysis -| (tokenize input) | -+----------------------+ - | - v -+----------------------+ -| Directive Engine | <- Part of: Directive Handling -| (#define, #if, etc.) 
| -+----------------------+ - | - v -+----------------------+ -| Conditional Logic | <- Part of: Conditional Compilation -| (#if/#ifdef/#else) | -+----------------------+ - | - v -+----------------------+ -| Macro Expansion | <- Part of: Macro Expansion -| (object/function) | -+----------------------+ - | - v -+----------------------+ -| Callback Hooks | <- Part of: Hook and Callback Interface -| (cpp_callbacks) | -+----------------------+ - | - v -+----------------------+ -| Output Tokens | <- Output stream to compiler frontend -| (to GCC parser) | -+----------------------+ -#+END_SRC - -Each block corresponds to a major processing stage in `cpp`. The functional groups defined earlier align to these blocks as indicated, though some (like state management and diagnostics) operate globally across the pipeline. - - - -* Major Functional Partitions of the C Preprocessor (cpp) - -This section outlines the primary architectural components of the C preprocessor as implemented in GCC 12.x. These functional partitions help frame how cpp processes input and how its internal modules interact. - -** 1. Lexical Analysis -- Tokenizes input into =cpp_token= streams. -- Decodes: - - UTF-8 characters - - Trigraphs (e.g., =??=) - - Digraphs (e.g., =<: = for =[=) -- Central structure: =cpp_lexer= -- Produces tokens for macro expansion and conditional evaluation. -** 2. Directive Handling -- Processes all =#= directives, including: - - =#define=, =#undef=, =#include=, =#line=, =#error=, =#pragma= - - Extended directives like =#assign=, =#call= if supported. -- Managed via =directive_table= and dispatch functions like =do_define=, =do_include=, etc. - -** 3. Conditional Compilation -- Handles constructs like: - - =#if=, =#ifdef=, =#ifndef=, =#elif=, =#else=, =#endif= -- Used to include or exclude code based on macro definitions and constant expressions. -- Driven by the =if_stack= in =cpp_reader=. -- Central to controlling variant builds, platform-specific code, or staged compilation. 
-** 4. File Inclusion and Search Paths -- Resolves =#include= and maintains include history. -- Handles: - - System vs user includes (<...> vs "..."). - - Include path resolution via =cpp_search_path=. - - File change tracking via =file_stack=. -** 5. Macro Expansion -- Handles object-like and function-like macros: - - =#define PI 3.14= - - =#define SQR(x) ((x)*(x))= -- Manages: - - Argument collection and expansion - - Token-pasting (=##=) and stringification (=#=) -- Involves =macro_table=, =collect_args=, and =expand_macro()= - -** 6. Diagnostics and Error Recovery -- Reports syntax errors, macro misuse, directive misuse. -- Uses: - - =cpp_error=, =cpp_warning=, =cpp_notice= - - Tracks macro nesting, input location, and file state for context. - -** 7. Hook and Callback Interface -- Interface: =cpp_callbacks= -- Allows frontend or plugin to observe: - - Macro definitions - - File changes - - Token output stream -- Enables debugging tools, IDEs, or language servers to integrate preprocessor awareness. - -** 8. State Management and Scoping -- Maintains global and file-level preprocessor state. -- Tracks: - - Nested conditional state via =if_stack= - - Macro table lifetimes and shadowing - - Include guards and =#pragma once= heuristics - - -* cpplib.h -- Application Interface Overview - -This section documents the **interface** and **in-memory model** of the C preprocessor (`libcpp`) from GCC 12.2.0. -It covers core data structures (tokens, macros, readers) and the primary functions for working with them. - -** Key Data Structures - -*** Token & Token Metadata -- `enum cpp_ttype` :: All possible token types (operators, names, literals, etc.) 
-- `struct cpp_token` :: Represents a token in the stream (with union-based payload) -- `enum cpp_token_fld_kind` :: Discriminates the active field in `cpp_token.val` -- `struct cpp_string` :: Raw string representation with length and pointer - -*** Macros & Identifiers -- `struct cpp_macro` :: Describes macro kind, parameter list, and token expansion -- `enum cpp_macro_kind` :: ISO-style, traditional-style, and assertion macros -- `struct cpp_identifier` :: Canonical and original spellings of a name -- `struct cpp_macro_arg` :: Argument number and spelling for macro arguments - -*** Symbol Table -- `struct cpp_hashnode` :: Hash table node for identifiers/macros -- `enum node_type` :: Distinguishes macro types (arg/user/builtin) -- `union _cpp_hashnode_value` :: Payload (macro, arg index, etc.) -- `enum cpp_builtin_type` :: Reserved built-ins like `__LINE__`, `__FILE__`, `_Pragma` - -*** Reader & Configuration -- `struct cpp_reader` :: Forward-declared. Central structure for preprocessing. -- `struct cpp_options` :: Stores all language mode flags, warning flags, and feature toggles. -- `struct cpp_callbacks` :: Client hook interface for diagnostic, macro, and file events. -- `struct cpp_dir` :: Represents an `#include` search directory. 
- -*** Numerics -- `struct cpp_num` :: Two-part 64-bit integer (high, low), overflow flags -- `cpp_classify_number` :: Categorizes radix/type (e.g., `0x`, `u`, `LL`) -- Defines :: `CPP_N_*` classify bits (INTEGER, FLOATING, WIDTH, RADIX, SUFFIX) - -*** Charset Handling -- `typedef cppchar_t` :: 32-bit safe character representation -- `struct cpp_decoded_char` :: Result of UTF-8 decoding step -- `struct cpp_char_column_policy` :: Visual column handling for diagnostics -- `class cpp_display_width_computation` :: Converts UTF-8 sequence to visual width - -*** Comment Tracking -- `struct cpp_comment`, `cpp_comment_table` :: Captures all parsed comments (if enabled) - -** Core Functions - -*** Lifecycle & Reader Setup -- `cpp_create_reader(enum c_lang, ...)` :: Allocates and initializes `cpp_reader` -- `cpp_finish`, `cpp_destroy` :: Finalize and free the reader -- `cpp_post_options` :: Commit option changes after parsing flags - -*** Preprocessing Input -- `cpp_read_main_file` :: Begin reading and preprocessing a source file -- `cpp_get_token()` :: Fetch next token from stream -- `cpp_peek_token()` :: Peek ahead without consuming -- `cpp_backup_tokens()` :: Push tokens back for re-parsing -- `cpp_retrofit_as_include()` :: Treat main file as if included - -*** Macro System -- `cpp_define()`, `cpp_define_unused()`, `cpp_define_lazily()` :: Define macros -- `cpp_macro_definition()` :: Dump macro body as string -- `cpp_compare_macros()` :: Deep compare two macros -- `cpp_undef()`, `cpp_undef_all()` :: Remove macro(s) -- `cpp_set_deferred_macro()`, `cpp_get_deferred_macro()` :: Lazy macro substitution - -*** Symbol Lookup -- `cpp_lookup()` :: Lookup or create an identifier hashnode -- `cpp_forall_identifiers()` :: Iterate over all identifiers - -*** String & Char Evaluation -- `cpp_interpret_charconst()` :: Parse a character constant (e.g. 
`'a'`) -- `cpp_interpret_string()` :: Parse string literal(s) into `cpp_string` -- `cpp_interpret_integer()` :: Parse numeric token into `cpp_num` - -*** Diagnostics -- `cpp_error()`, `cpp_warning()`, `cpp_pedwarning()` :: General messages -- `cpp_error_at()` :: Message with source location (rich_location optional) -- `cpp_errno()` / `cpp_errno_filename()` :: Errors based on `errno` -- `cpp_warning_with_line()` :: Fallback location-based warnings -- `cpp_get_callbacks()` / `cpp_set_callbacks()` :: Manage diagnostic hooks - -*** Extension Hooks & Pragma -- `cpp_register_pragma()` :: Register custom `#pragma` handler -- `cpp_get_callbacks()` :: Access to client-supplied hook table -- `cpp_define_formatted()` :: Macro with `printf`-style input -- `cpp_directive_only_process()` :: Run directive-only logic on a token stream - -*** Includes & File Management -- `cpp_set_include_chains()` :: Set system and user include paths -- `cpp_push_buffer()` :: Manually push a buffer for parsing -- `cpp_included()`, `cpp_included_before()` :: Has this file been included? -- `cpp_get_converted_source()` :: Read a file in input charset, return decoded buffer - -** Token Types (cpp_ttype) - -A full enumeration of all tokens in the preprocessor: -- Operators: `CPP_PLUS`, `CPP_MINUS`, `CPP_EQ_EQ`, etc. -- Punctuation: `CPP_OPEN_PAREN`, `CPP_HASH`, `CPP_SEMICOLON` -- Literals: `CPP_STRING`, `CPP_WCHAR`, `CPP_NUMBER` -- Special: `CPP_MACRO_ARG`, `CPP_PRAGMA`, `CPP_EOF` - -Each token has: -- Type (`enum cpp_ttype`) -- Flags (`PREV_WHITE`, `DIGRAPH`, `NO_EXPAND`, etc.) 
-- Source location -- Union payload (e.g., string, macro arg, hashnode) - -** Interface Concepts Beyond Code -*** Unicode Handling -- Input is normalized per `cpp_normalize_level` -- UTF-8 is expanded into 32-bit code points (`cppchar_t`) -- Display width of characters is estimated for diagnostics -- Bidi (bidirectional) controls are optionally scanned/warned - -*** Client Extension Hooks -- Most preprocessing operations (macro use, `#include`, comments, errors) are callback-hooked -- Used by GCC frontend to track macro use, implement diagnostics, and guide `#pragma` processing - -*** Dependency Generation -- `cpp_finish()` accepts an output stream for dependency info -- Options control whether main file is included, phony targets are added, etc. - -** Summary - -`cpplib.h` serves as both API contract and internal representation guide. -- It offers a high-fidelity view of source tokens for later compiler stages. -- The entire macro system, character encoding, and diagnostic lifecycle are managed through this interface. - - - - -* Callback Hooks (cpp_callbacks) - -The `cpp_callbacks` struct in `cpplib.h` allows external consumers (e.g., GCC frontend, IDE integrations, or plugins) to receive notifications during preprocessing. Each function pointer in this struct represents a hookable event. 
- -** Overview - -Hooks are triggered at specific stages: -- After macro definition or undefinition -- Before and after file inclusion -- When tokens are emitted -- Upon encountering diagnostics -- During comment scanning (if enabled) -- On encountering special directives (e.g., `#pragma`) - -** Hook Structure - -#+BEGIN_SRC c -struct cpp_callbacks { - void (*define)(cpp_reader *, source_location, const cpp_hashnode *); - void (*undef)(cpp_reader *, source_location, const cpp_hashnode *); - void (*include)(cpp_reader *, const char *filename, int angle_brackets); - void (*file_change)(cpp_reader *, const struct line_map *); - void (*line_change)(cpp_reader *, source_location, int to_file, int to_line); - void (*ident)(cpp_reader *, const cpp_string *); - void (*invalid_directive)(cpp_reader *); - void (*def_pragma)(cpp_reader *, const cpp_token *); - void (*cb_comment)(cpp_reader *, const cpp_token *); -}; -#+END_SRC - -Each callback receives either a pointer to the `cpp_reader`, the affected token or structure, and optional contextual data. - ---- - -** `define` - -*** Trigger -- Fired immediately after a macro is defined with `#define`. - -*** Parameters -- `cpp_reader *pfile`: global preprocessor state (read-write). -- `source_location loc`: location of the `#define`. -- `const cpp_hashnode *node`: the macro name and metadata (read-only in this context). - -*** Semantics -- The `cpp_hashnode` holds the macro's name and a pointer to its `cpp_macro` definition. -- Modifying the macro at this point is possible but discouraged. Use `cpp_undef()` + `cpp_define()` instead if redefinition is needed. - -*** Uses -- GCC uses this to update dependency tracking and debug tables. -- Tools may track macro definitions, emit logs, or enforce naming policies. - ---- - -** `undef` - -*** Trigger -- Fired after `#undef` removes a macro. - -*** Parameters -- Same as `define`. - -*** Semantics -- The node is marked `undefined`, but the symbol remains in the hash table. 
-- No mutation should occur—only inspection or logging. - -*** Uses -- Enables reversal tracking or macro scoping analysis. - ---- - -** `include` - -*** Trigger -- Fired just before a file is opened via `#include`. - -*** Parameters -- `cpp_reader *pfile` -- `const char *filename`: string from the include directive (not normalized). -- `int angle_brackets`: nonzero for `<...>`, zero for `"..."`. - -*** Semantics -- Purely informational; does not affect include search or suppression. -- The filename is unverified and not guaranteed to exist. - -*** Uses -- IDEs and build tools use this to build include graphs. -- LSPs use it to track file references and symbol origins. - ---- - -** `file_change` - -*** Trigger -- Called when the active input file changes (entry or exit of `#include`). - -*** Parameters -- `cpp_reader *pfile` -- `const struct line_map *map`: describes the current file's location and context. - -*** Semantics -- `line_map` gives full access to file/line/column mapping. -- This structure is read-only; mutating it will corrupt diagnostics and tokenization. - -*** Uses -- Debug info (DWARF line tables), logging, stack-based include tracking. - ---- - -** `line_change` - -*** Trigger -- Fired on `#line` directives or line-mapping transitions. - -*** Parameters -- `cpp_reader *pfile` -- `source_location loc`: location in input stream. -- `int to_file`: non-zero if a new file name is being used. -- `int to_line`: new logical line number. - -*** Semantics -- Use this to remap locations or re-synchronize overlays. -- These values are inputs to the line map; do not write back. - -*** Uses -- Used in DWARF debug info to support accurate line-based breakpoints. - ---- - -** `ident` - -*** Trigger -- Called when a `#ident` directive is parsed. - -*** Parameters -- `cpp_reader *pfile` -- `const cpp_string *text`: payload of the identifier message. - -*** Semantics -- Informational only. Common in legacy systems or codegen traces. 
- -*** Uses -- Collect module identity, versioning hints, or logmarks. - ---- - -** `invalid_directive` - -*** Trigger -- Fired when an unrecognized or malformed directive is encountered. - -*** Parameters -- `cpp_reader *pfile` - -*** Semantics -- Hook has no extra context; use `cpp_get_token()` to recover. -- Hook may trigger fallback behavior or custom directive logic. - -*** Uses -- Used in `-fpreprocessed` mode to suppress diagnostics. -- External tools can use this to extend the directive set. - ---- - -** `def_pragma` - -*** Trigger -- Fired when a `#pragma` directive is parsed. - -*** Parameters -- `cpp_reader *pfile` -- `const cpp_token *pragma`: token stream beginning with `CPP_PRAGMA`. - -*** Semantics -- Read-only access to token stream. -- Mutation possible via `cpp_push_buffer()` to inject expanded tokens. - -*** Uses -- GCC plugins hook this to implement custom `#pragma` behavior. -- Can trigger front-end features (like `#pragma GCC diagnostic`). - ---- - -** `cb_comment` - -*** Trigger -- Optional. Enabled if comment tracking is requested. - -*** Parameters -- `cpp_reader *pfile` -- `const cpp_token *comment`: holds text of comment. - -*** Semantics -- Only line/block comment content is captured, not semantics. -- Read-only token; do not mutate token payload. - -*** Uses -- Used by source-to-source translators and formatters. -- Some static analyzers inspect comments for hints or disables. - ---- - -** Summary - -The `cpp_callbacks` interface enables observational and limited transformational interaction with the preprocessor pipeline. - -- Most parameters are read-only or shallow copies. -- For transformations, prefer using `cpp_define()`, `cpp_push_buffer()`, or `cpp_backup_tokens()` externally. -- Internal structures like `cpp_reader`, `cpp_token`, and `cpp_macro` should not be mutated unless explicitly permitted. 
- - - -* Plugin-Like Integration in libcpp - -Unlike the main GCC compiler, which supports a formal plugin system (`gcc-plugin.h`), `libcpp` (the C preprocessor library) does *not* support plugins in the dynamic or runtime-loaded sense. There is no system for loading shared libraries, registering handlers via symbols, or extending preprocessor behavior through runtime modules. - -** Static Hook Interface via cpp_callbacks - -Instead, `libcpp` exposes a *statically defined interface* (`struct cpp_callbacks`) for embedding applications to receive notifications of preprocessor events. These include: - -- Macro definitions and undefinitions -- Source file entry/exit -- Comment and pragma parsing -- Token emission and buffer transitions - -An embedding client (such as GCC's C/C++ frontend, or a third-party tool using libcpp) may assign function pointers directly into this struct during reader setup. - -#+BEGIN_SRC c -cpp_reader *r = cpp_create_reader(...); -cpp_callbacks *cb = cpp_get_callbacks(r); -cb->macro_defined = my_macro_handler; -cb->file_change = my_file_tracker; -#+END_SRC - -This pattern is analogous to a *plugin interface*, but all logic is statically linked at compile time. - -** Mutability and Access Scope - -The callback interface is primarily **observational**—that is, hooks are expected to inspect events, not mutate the `cpp_reader` state directly. However, advanced users can, with care, reach into the data structures passed to them (e.g., `cpp_macro`, `cpp_hashnode`) and affect behavior, though this is neither documented nor officially supported. 
- -In summary: - -| Feature | GCC Frontend Plugin | libcpp Callback Interface | -|--------------------------+---------------------+----------------------------| -| Dynamically loadable | Yes | No | -| Runtime extension API | Yes (`gcc-plugin.h`) | No | -| Assign custom handlers | Yes | Yes (via `cpp_callbacks`) | -| Mutate core structures | With care | With care (not endorsed) | -| Stability across versions| Best-effort | Internal API, may break | - -** Recommendation - -Use `cpp_callbacks` as a read-only interface to monitor preprocessing behavior. If deeper mutation or instrumentation is required, consider modifying or forking `libcpp` itself. There is currently no officially supported way to extend it at runtime. diff --git "a/document\360\237\226\211/source/cpp_reader.org" "b/document\360\237\226\211/source/cpp_reader.org" deleted file mode 100644 index bc87d15..0000000 --- "a/document\360\237\226\211/source/cpp_reader.org" +++ /dev/null @@ -1,147 +0,0 @@ -#+TITLE: cpp_reader: Preprocessor State and Interface Guide -#+AUTHOR: Caelestis Index -#+FILETAGS: cpp, GCC internals, preprocessor, architecture - -* Overview -The =cpp_reader= struct in GCC's =libcpp= encapsulates the complete state of a single C preprocessor session. It governs token input, macro expansion, directive parsing, include stack management, and source map resolution. It is the central state object passed through nearly all parts of the C preprocessor. - -* 1. State Data - -** 1.1 Buffer and Lexing State -- ~buffer~, ~overlaid_buffer~: Input buffer stack for file and macro streams. -- ~cur_token~, ~cur_run~, ~base_run~: Active token buffer and tokenrun tracking. -- ~keep_tokens~: Whether to preserve old tokens (e.g., for diagnostics). -- ~a_buff~, ~u_buff~, ~free_buffs~: Temporary memory allocation pools. - -** 1.2 Parsing and Directive State -- ~state~: General lexer state (includes ~in_directive~ flag). 
-- ~state.in_directive~: Boolean flag indicating whether the preprocessor is currently parsing a directive line. If ~true~, token behavior (e.g., whitespace and line continuation) may differ. -- ~directive~, ~directive_line~: Currently parsed directive and its location. -- ~directive_result~: Token synthesized by a directive (if any). - -** 1.3 Macro Context and Expansion -- ~context~, ~base_context~: Macro expansion call stack. -- ~top_most_macro_node~: Current top-level macro under expansion. -- ~about_to_expand_macro_p~: Indicates if a macro is about to expand. -- ~macro_buffer~, ~macro_buffer_len~: Buffers for rendering macro string forms. - -** 1.4 Include and File Lookup State -- ~quote_include~, ~bracket_include~, ~no_search_path~: Search paths. -- ~all_files~, ~main_file~: Linked list of all known input files. -- ~file_hash~, ~dir_hash~: Hashtables for file path caching. -- ~nonexistent_file_hash~: Optimizes negative lookup caching. -- ~seen_once_only~: Tracks ~#pragma once~ semantics. - -** 1.5 Character Set Conversion -- ~narrow_cset_desc~, ~utf8_cset_desc~, ~wide_cset_desc~, etc.: Converters for source to execution character encodings. - -** 1.6 Location Mapping and Source Positioning -- ~line_table~: GCC's =line_maps= structure for virtual location tracking. -- ~invocation_location~, ~main_loc~, ~forced_token_location~: Positional context for diagnostics, token creation. - -** 1.7 Miscellaneous Flags and Utilities -- ~quote_ignores_source_dir~: Include resolution behavior flag. -- ~counter~: Value of the ~__COUNTER__~ macro. -- ~out~: Output buffer for traditional preprocessing mode. -- ~savedstate~: Used for dependency tracking with precompiled headers. -- ~comments~: Optional comment capture buffer. - -* 2. Core Interface Functions -** 2.1 Token Retrieval -- ~cpp_get_token(pfile)~: Public interface for retrieving the next logical token. -- ~cpp_peek_token(pfile, N)~: Look ahead without consuming. 
-- ~cpp_get_token_1(pfile)~: Internal token fetch used during macro expansion. - -** 2.2 Macro Definition and Expansion -- ~_cpp_new_macro(pfile, cmk_macro, obstack_ptr)~: Allocate and initialize a new macro definition. -- ~_cpp_mark_macro_used(node)~: Mark a macro as having been used. -- ~replace_args(...)~: Expand and replace macro arguments (not used during directive handling). -- ~collect_args(...)~: Collects arguments for a function-like macro invocation. -- ~collect_single_argument(...)~: Parses one macro argument and handles token accumulation. -- ~cpp_arguments_ok(...)~: Checks argument count and matching for a macro invocation. -- ~set_arg_token(...)~: Sets or appends a token in an argument’s expansion list. - -** 2.3 Directive Handling Helpers -- ~_cpp_skip_rest_of_line(pfile)~: Skip trailing tokens after directive arguments. -- ~lex_macro_node(pfile)~: Specialized lexer for parsing macro names. - -** 2.4 File/Include Handling -- ~cpp_push_include(pfile, filename)~: Add a new include to the stack. -- ~cpp_find_include_file(...)~: Path search logic. - -** 2.5 Location Utilities -- ~cpp_token_location(token)~: Extracts a =location_t= from a token. -- ~linemap_add(...)~: Adds a mapping between logical and physical line/column. - -** 2.6 Miscellaneous -- ~cpp_warning_with_line(...)~, ~cpp_error_with_line(...)~: Emit diagnostics with location. -- ~cpp_lookup(pfile, name, length)~: Interns an identifier and returns a ~cpp_hashnode *~. -- ~NODE_NAME(node)~: Expands to the null-terminated name of a macro node. - -* 3. 
Usage Examples - -** 3.1 Defining a Macro from a Directive -#+BEGIN_SRC c -cpp_hashnode *node = lex_macro_node(pfile); -cpp_macro *macro = _cpp_new_macro(pfile, cmk_macro, _cpp_reserve_room(pfile, 0, sizeof(cpp_macro))); -macro->count = 1; -macro->exp.tokens[0] = make_number_token("42"); -node->type = NT_USER_MACRO; -node->value.macro = macro; -_cpp_mark_macro_used(node); -#+END_SRC - -** 3.2 Parsing a Directive With Two Arguments -#+BEGIN_SRC c -cpp_token *arg1 = cpp_get_token(pfile); -cpp_token *comma = cpp_get_token(pfile); -if (comma->type != CPP_COMMA) - cpp_error(pfile, CPP_DL_ERROR, "expected ',' after macro name"); -cpp_token *arg2 = cpp_get_token(pfile); -_cpp_skip_rest_of_line(pfile); -#+END_SRC - -** 3.3 Controlling Directive Context -#+BEGIN_SRC c -bool saved = pfile->state.in_directive; -pfile->state.in_directive = false; -assign_handler(pfile); -pfile->state.in_directive = saved; -#+END_SRC - -** 3.4 Tokenization and Location Debugging -#+BEGIN_SRC c -const cpp_token *tok = cpp_get_token(pfile); -location_t loc = tok->src_loc; -printf("token at line: %d\n", LOCATION_LINE(loc)); -#+END_SRC - -* 4. directive.cc extensions to the reader -- ~lex_macro_node(pfile)~: Returns a ~cpp_hashnode *~ for the next identifier, used for directives like ~#define~ or custom ones like ~#assign~. -- ~_cpp_skip_rest_of_line(pfile)~: Advances the token stream to the next physical line. -- ~cpp_error_with_line(...)~, ~cpp_warning_with_line(...)~: Used for directive diagnostics. -- ~cpp_lookup(pfile, name, length)~: Interns a name as a hashnode symbol. -- ~cpp_reader->directive_result~: Used to push a synthesized token result into the stream (e.g., for ~#include_next~). -- ~pfile->state.in_directive~: Must be manually toggled when directive code calls into macro infrastructure. -* 5. macro.cc extensions to the reader - -*** 4.2.1 collect_args(...) -Accumulates macro arguments for a function-like macro. 
Reads and segments the input stream into a series of ~macro_arg~ entries, tracking nesting of parentheses and token boundaries. - -*** 4.2.2 collect_single_argument(...) -Parses and collects one macro argument, terminating on a comma or closing paren. Used internally by ~collect_args~, but can be called separately for single-argument macro handling. - -*** 4.2.3 replace_args(...) -Performs full substitution of macro arguments into the macro body. Handles token pasting (~##~), stringification (~#~), and recursive macro expansion. - -*** 4.2.4 cpp_arguments_ok(...) -Checks whether the number of provided arguments matches the macro’s parameter list. Validates ~paramc~ and variadic status. - -*** 4.2.5 set_arg_token(...) -Helper to insert or append a token into a ~macro_arg~. Used when building argument streams in ~collect_single_argument~. - -These routines enable fine-grained control over macro behavior and can be selectively reused to simulate macro expansion at directive time (e.g., ~#assign~, ~#bind~, or macro templating extensions). -* 6. Conclusion -~cpp_reader~ is the heart of the preprocessor, acting as a unifying context for token streams, macro tables, buffer management, diagnostics, and parser state. Understanding and safely manipulating it is key to extending the preprocessor (e.g., adding new directives like ~#assign~) without destabilizing expansion or include logic. - -Use ~in_directive~, ~context~, and ~cur_token~ fields with care, and follow the established patterns in ~directives.cc~ and ~macro.cc~ to ensure consistent behavior across parse and expansion phases. 
diff --git "a/document\360\237\226\211/tool_chain_dependency_layers.org" "b/document\360\237\226\211/tool_chain_dependency_layers.org" deleted file mode 100644 index 94c7a70..0000000 --- "a/document\360\237\226\211/tool_chain_dependency_layers.org" +++ /dev/null @@ -1,78 +0,0 @@ -#+TITLE: Toolchain Dependency Layers -#+AUTHOR: Thomas Walker Lynch -#+DATE: 2025-05-06 -#+OPTIONS: toc:nil num:nil -#+LANGUAGE: en - -* Purpose - -This document outlines the dependencies involved in building a standalone GCC toolchain. It compares two approaches: - -1. Using system-provided tools and headers to build GCC -2. Building a fully self-consistent standalone toolchain - -Understanding the bootstrap sequence is critical for modifying or reproducing GCC builds, especially when building in isolation. - -* The Story: Bootstrap Spiral - -So this programmer — he wanted to add a new directive to GCC. - -So he downloaded the GCC sources with the intent to make a modified, standalone copy. - -But to compile GCC, he needed standard C library headers — which meant downloading glibc. - -But to compile glibc, he needed a working C compiler. So he would first need a minimal GCC — stage 1. - -But to build that stage 1 GCC, he needed glibc headers. - -So he compiled the glibc headers first. - -Then he compiled stage 1 GCC. - -Then he compiled the full glibc. - -Then he compiled the full GCC. - -Ah, but to compile the glibc headers, he first needed the Linux kernel headers... - -There was an old lady who swallowed a fly. I don’t know why she... - -* Approach 1: System-Assisted Bootstrap - -This method uses the host system’s tools and headers to provide bootstrap support. It is simpler and faster, but not fully isolated. - -** Dependencies: - -- System-provided: - - C compiler (e.g. GCC) - - libc (headers and shared objects) - - binutils - - Linux kernel headers - -- Build Steps: - 1. Build binutils using system GCC - 2. 
Build GCC using system libc and headers - -** Characteristics: -- Fast -- Relies on host environment -- Not self-contained - -** Use Case: -- Building a local variant of GCC that will be used on the same system -- Development where purity or relocatability isn’t required - -* Approach 2: Fully Self-Consistent Toolchain - -This method builds every component of the toolchain in a clean directory, using only upstream sources. It isolates the build from host interference. - -** Dependencies: - -- Linux kernel headers (must be provided up front) -- Binutils source -- Glibc source -- GCC source - -** Build Sequence: - -1. Install Linux kernel headers → needed to build glib diff --git "a/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc" "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc" new file mode 100644 index 0000000..21cd614 --- /dev/null +++ "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc" @@ -0,0 +1,48 @@ +bool +cpp_get_paren_balanced_chars(cpp_reader *pfile) +{ + const unsigned char *cur = pfile->buffer->cur; + const unsigned char *rlimit = pfile->buffer->rlimit; + cpp_buffer *buffer = pfile->buffer; + + // Skip leading whitespace + while (cur < rlimit && ISSPACE(*cur)) + cur++; + + if (cur == rlimit || *cur != '(') + { + cpp_error(pfile, CPP_DL_ERROR, "expected opening parenthesis for macro body"); + return false; + } + + int depth = 0; + const unsigned char *scan = cur; + while (scan < rlimit) + { + if (*scan == '(') + depth++; + else if (*scan == ')') + { + depth--; + if (depth == 0) + { + // Copy from opening ( to matching ) inclusive + size_t len = scan - cur + 1; + unsigned char *copy = (unsigned char *)_cpp_unaligned_alloc(pfile, len + 1); + memcpy(copy, cur, len); + copy[len] = '\0'; + + // Point lexer buffer to just this region + buffer->cur = copy; + buffer->rlimit = copy + len; + buffer->next_line = NULL; // Signals EOF to lexer + return true; + } + } + scan++; + } + + // If we got here, closing paren was 
never found + cpp_error(pfile, CPP_DL_ERROR, "unclosed parenthesis in macro body"); + return false; +} diff --git "a/script_gcc_min-12\360\237\226\211/macro.cc" "b/script_gcc_min-12\360\237\226\211/macro.cc" index 82d8b4b..d491054 100644 --- "a/script_gcc_min-12\360\237\226\211/macro.cc" +++ "b/script_gcc_min-12\360\237\226\211/macro.cc" @@ -4138,7 +4138,7 @@ cpp_macro_definition (cpp_reader *pfile, cpp_hashnode *node, // see directives.cc extern const char *cpp_token_as_text(const cpp_token *token); -// a helper function for probing where we are at in the parse +// a helper function for probing where the parser thinks it is in the source void debug_peek_token (cpp_reader *pfile) { @@ -4156,22 +4156,48 @@ debug_peek_token (cpp_reader *pfile) _cpp_backup_tokens(pfile, 1); } +// collects the body of a #define or related directive static bool -collect_macro_body_tokens (cpp_reader *pfile, - cpp_macro *macro, - unsigned int *num_extra_tokens_out, - const char *paste_op_error_msg) -{ +collect_body_tokens( + cpp_reader *pfile + ,cpp_macro *macro + ,unsigned int *num_extra_tokens_out + ,const char *paste_op_error_msg + ,bool paren_matching +){ bool following_paste_op = false; unsigned int num_extra_tokens = 0; + int paren_depth; + cpp_token *token; + + if(paren_matching){ + token = _cpp_lex_direct(pfile); + if(token->type != CPP_OPEN_PAREN){ + cpp_error_with_line( + pfile + ,CPP_DL_ERROR + ,token->src_loc + ,0 + ,"expected body delimiter '(', but found: %s" + ,cpp_token_as_text(token) + ); + fprintf(stderr, "exiting collect_body_tokens did not find opening paren\n"); + return false; + } + paren_depth = 1; + fprintf( stderr, "entry paren_depth: %d\n", paren_depth); + } for (vaopt_state vaopt_tracker (pfile, macro->variadic, NULL);; ) { - cpp_token *token = NULL; - + // gets a token + // first parses token onto `macro->exp.tokens[macro->count]` + // then pulls the token off of `macro->exp.tokens[macro->count]` macro = lex_expansion_token(pfile, macro); token = 
&macro->exp.tokens[macro->count++]; + fprintf( stderr, "top of loop, read token %s\n", cpp_token_as_text(token) ); + // recognize macro args, give them type CPP_MACRO_ARG if (macro->count > 1 && token[-1].type == CPP_HASH && macro->fun_like) { if (token->type == CPP_MACRO_ARG @@ -4193,27 +4219,57 @@ collect_macro_body_tokens (cpp_reader *pfile, { cpp_error(pfile, CPP_DL_ERROR, "'#' is not followed by a macro parameter"); + fprintf(stderr, "exiting collect_body_tokens not a macro arg and language is not ASM\n"); return false; } } - if (token->type == CPP_EOF) - { - if (following_paste_op) - { - cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg); - return false; - } - if (!vaopt_tracker.completed()) + // parentheses matching overhead + if(paren_matching){ + if( token->type == CPP_OPEN_PAREN || token->type == CPP_CLOSE_PAREN){ + if(token->type == CPP_OPEN_PAREN) paren_depth++; + if(token->type == CPP_CLOSE_PAREN) paren_depth--; + fprintf( stderr, "new paren_depth: %d\n", paren_depth); + } + + if(token->type == CPP_EOF){ + fprintf(stderr, "Found CPP_EOF at paren depth %d\n", paren_depth); + macro->count--; + if(!_cpp_get_fresh_line(pfile)){ + fprintf(stderr, "exiting collect_body_tokens _cpp_get_fresh_line failed\n"); return false; - break; + } + fprintf(stderr, "Found CPP_EOF at depth %d read new line now continuing loop \n", paren_depth); + continue; } + } + + // exit loop at the end of the macro body + if( + paren_matching && paren_depth == 0 + || !paren_matching && token->type == CPP_EOF + ){ + fprintf(stderr, "exiting macro body collect loops\n"); + if(following_paste_op){ + cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg); + fprintf( stderr, "exiting collect_body_tokens due to following_past_op\n"); + return false; + } + if( !vaopt_tracker.completed() ){ + fprintf( stderr, "exiting collect_body_tokens due to !vaopt_tracker.completed()\n"); + return false; + } + *num_extra_tokens_out = num_extra_tokens; + macro->count--; // drop the terminator + return true; + } if
(token->type == CPP_PASTE) { if (macro->count == 1) { cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg); + fprintf( stderr, "exiting collect_body_tokens paste event\n"); return false; } @@ -4233,18 +4289,18 @@ collect_macro_body_tokens (cpp_reader *pfile, } following_paste_op = true; } - else + else{ following_paste_op = false; + } - if (vaopt_tracker.update(token) == vaopt_state::ERROR) + if (vaopt_tracker.update(token) == vaopt_state::ERROR){ + fprintf( stderr, "exiting collect_body_token due to vaopt_tracker.update(token) == vaopt_state::ERROR\n"); return false; + } } - *num_extra_tokens_out = num_extra_tokens; - return true; } - //-------------------------------------------------------------------------------- // for `#macro` directive /* @@ -4271,6 +4327,9 @@ create_iso_macro (cpp_reader *pfile) bool ok = false; cpp_macro *macro = NULL; +int saved_in_directive = pfile->state.in_directive; +int saved = pfile->keep_tokens; + /* -Saves token allocation address held in pfile->cur_token. -Gives a new token allocation address to pfile->cur_token, that of cpp_token first. @@ -4330,16 +4389,35 @@ create_iso_macro (cpp_reader *pfile) macro->parm.params = params; macro->fun_like = true; - // collects from pfile the tokens that constitute the macro body - if (!collect_macro_body_tokens(pfile, macro, &num_extra_tokens, paste_op_error_msg)) - goto out; + /* + Collect the macro body tokens. + A #macro () body is delineated by parentheses + */ + + +pfile->state.in_directive = 0; // allow fresh lines +pfile->keep_tokens = 1; + + // collects the remaining body tokens + if( + !collect_body_tokens( + pfile + ,macro + ,&num_extra_tokens + ,paste_op_error_msg + ,true + ) + ) goto out; + +pfile->keep_tokens = saved; +pfile->state.in_directive = saved_in_directive; // restore + + + // At this point, even if the body parse fails, we will say we made a macro. I'm not sure why as we haven't commited it yet, but this is what is in the code. 
Apparently we throw away the macro if the body does not parse. ok = true; - /* Don't count the CPP_EOF. */ - macro->count--; - // commit the cpp struct to memory // the struct reserves space for one token, the others run off the end macro = (cpp_macro *)_cpp_commit_buff(