From ff4c5f0e9de538687d9ea6f3f7c9289183652894 Mon Sep 17 00:00:00 2001
From: Thomas Walker Lynch
Date: Sat, 10 May 2025 04:41:39 -0700
Subject: [PATCH] working #macro directive, albeit still with embedded test
 messages

---
 .../how_it_works/cpp.org"                     |   0
 .../how_it_works/cpp_reader.org"              |   0
 .../how_it_works/lexing.org"                  | 230 ++++++++++++++++++
 .../tool_chain_dependency_layers.org"         |   0
 .../get_paren_balanced_chars.cc"              |  48 ++++
 "script_gcc_min-12\360\237\226\211/macro.cc"  | 134 +++++++---
 6 files changed, 384 insertions(+), 28 deletions(-)
 rename "document\360\237\226\211/source/cpp.org" => "document\360\237\226\211/how_it_works/cpp.org" (100%)
 rename "document\360\237\226\211/source/cpp_reader.org" => "document\360\237\226\211/how_it_works/cpp_reader.org" (100%)
 create mode 100644 "document\360\237\226\211/how_it_works/lexing.org"
 rename "document\360\237\226\211/tool_chain_dependency_layers.org" => "document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org" (100%)
 create mode 100644 "script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"

diff --git "a/document\360\237\226\211/source/cpp.org" "b/document\360\237\226\211/how_it_works/cpp.org"
similarity index 100%
rename from "document\360\237\226\211/source/cpp.org"
rename to "document\360\237\226\211/how_it_works/cpp.org"
diff --git "a/document\360\237\226\211/source/cpp_reader.org" "b/document\360\237\226\211/how_it_works/cpp_reader.org"
similarity index 100%
rename from "document\360\237\226\211/source/cpp_reader.org"
rename to "document\360\237\226\211/how_it_works/cpp_reader.org"
diff --git "a/document\360\237\226\211/how_it_works/lexing.org" "b/document\360\237\226\211/how_it_works/lexing.org"
new file mode 100644
index 0000000..a6bed25
--- /dev/null
+++ "b/document\360\237\226\211/how_it_works/lexing.org"
@@ -0,0 +1,230 @@
+#+TITLE: GCC libcpp Lexer: Structure, Usage, and Extension
+#+AUTHOR: Caelus (OpenAI) and Thomas Walker Lynch
+#+DATE: 2025-05-09
+
+* Overview
+The C preprocessor lexer (`lex.cc`) in GCC's `libcpp` is responsible for scanning raw source characters and emitting `cpp_token` structures. It is Unicode-aware, macro-sensitive, context-tracking, and supports multiple levels of token buffering. The lexer is both a general-purpose lexical analyzer and a specialized component of the preprocessor.
+
+This document provides:
+1. An architectural overview of how the lexer operates.
+2. Guidance on interfacing with it: how to initialize it, invoke it, and consume its output.
+3. Examples demonstrating token flow and useful idioms.
+
+* 1. About the Lexer
+
+** 1.1 Services Provided
+The lexer transforms a stream of characters into a stream of `cpp_token`s. It performs:
+- UCN (Universal Character Name) expansion.
+- Unicode normalization for identifiers.
+- Detection of digraphs and trigraphs.
+- Skipping of whitespace and comments.
+- Classification into token types (`cpp_ttype`).
+- Optional macro expansion (via higher-level coordination with the macro subsystem).
+
+The function `_cpp_lex_token()` is the main internal entry point for lexing one token from the input stream.
+
+** 1.2 Token Types and Structures
+Tokens are represented as `struct cpp_token`, which contains:
+- `type`: token kind (from `cpp_ttype`)
+- `val`: a union holding the value (e.g. number, string, identifier)
+- `flags`: indicators such as `PREV_WHITE` or `DIGRAPH`
+- `src_loc`: location for diagnostics
+- for identifiers, `val.node.spelling` additionally caches the as-written (pre-normalization) spelling
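+
+Since `val` is a union, which member is meaningful depends on `type`. A minimal sketch of reading token values (field names follow the `cpp_token` layout in `libcpp/include/cpplib.h`; the helper function itself is ours, for illustration):
+
+#+begin_src c
+// illustrative helper (not part of libcpp): dispatch on token->type
+void print_token_value (const cpp_token *token)
+{
+  switch (token->type)
+    {
+    case CPP_NAME:
+      // identifiers carry their interned cpp_hashnode
+      printf ("identifier: %.*s\n",
+              (int) token->val.node.node->ident.len,
+              (const char *) token->val.node.node->ident.str);
+      break;
+    case CPP_NUMBER:
+    case CPP_STRING:
+      // numbers and string literals carry a cpp_string (text + length)
+      printf ("literal: %.*s\n",
+              (int) token->val.str.len,
+              (const char *) token->val.str.text);
+      break;
+    default:
+      printf ("token type: %s\n", cpp_type2name (token->type, token->flags));
+    }
+}
+#+end_src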
+
+Auxiliary structures include:
+- `cpp_hashnode`: interned identifiers and macro names
+- `normalize_state`: for handling normalization and BiDi context
+- `_cpp_buff`: dynamic buffers used for temporary token storage
+
+** 1.3 Unicode and Normalization
+The lexer enforces bidirectional (BiDi) Unicode structure using:
+- `context`, `normalize_state`: track BiDi embeddings and UCN states
+- `on_char`, `on_close`, `maybe_warn_bidi_on_close`: enforce structure
+
+** 1.4 Vectorized Fast Path
+Several functions (e.g. `search_line_sse2`) accelerate scanning on x86 via SIMD. These are conditionally invoked from `search_line_fast` when alignment and CPU features allow.
+
+** 1.5 Token Buffers and Pools
+Token buffers are managed using `_cpp_get_buff` (which takes a minimum size), `_cpp_extend_buff`, `_cpp_commit_buff`, and `_cpp_release_buff`. These form a scratch/reuse pool and reduce allocations when processing macros or lexing many tokens rapidly.
+
+* 2. How to Use the Lexer API
+
+** 2.1 Initialization
+Before lexing, a reader must be created and given its main file. `cpp_create_reader` takes the language dialect, an optional identifier hash table, and the `line_maps` instance used for location tracking; it performs the internal lexer setup (token runs, buffers) itself, so no separate lexer-initialization call is needed:
+
+#+begin_src c
+// CLK_GNUC89 selects the C dialect; line_table is the program's
+// line_maps instance used for location tracking
+cpp_reader *pfile = cpp_create_reader (CLK_GNUC89, NULL, line_table);
+cpp_read_main_file (pfile, "input.c");
+#+end_src
+
+** 2.2 Lexing Tokens
+To retrieve the next token without macro expansion (internal interface):
+
+#+begin_src c
+const cpp_token *token = _cpp_lex_token (pfile);
+#+end_src
+
+For directive parsing, to lex straight from the buffer with no lookahead handling:
+
+#+begin_src c
+cpp_token *token = _cpp_lex_direct (pfile);
+#+end_src
+
+The public entry point `cpp_get_token` sits above both and performs macro expansion.
+
+** 2.3 Token Inspection
+Each token has type and value fields:
+
+#+begin_src c
+if (token->type == CPP_NUMBER) {
+  printf ("Numeric token: %s\n",
+          (const char *) cpp_token_as_text (pfile, token));
+}
+#+end_src
+
+** 2.4 Identifier Handling
+Look up an identifier in the hash table (e.g., to check whether it names a macro):
+
+#+begin_src c
+cpp_hashnode *node = cpp_lookup (pfile, (const unsigned char *) "FOO", 3);
+if (cpp_macro_p (node)) {
+  // FOO is defined as a macro
+}
+#+end_src
+
+** 2.5 Stringification and Output
+To spell a token or write it to a stream:
+
+#+begin_src c
+unsigned char *text = cpp_token_as_text (pfile, token);
+cpp_output_token (token, stdout);
+#+end_src
+
+* 3. Examples and Advanced Use
+
+** 3.1 Simple Token Stream
+Lex a stream from input and print token types:
+
+#+begin_src c
+while (true) {
+  const cpp_token *tok = _cpp_lex_token (pfile);
+  if (tok->type == CPP_EOF)
+    break;
+  printf ("Token: %s\n", cpp_type2name (tok->type, tok->flags));
+}
+#+end_src
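+
+The loop above uses the internal entry point and therefore sees tokens before macro expansion. The public interface `cpp_get_token` performs expansion and may interleave `CPP_PADDING` tokens, which consumers normally skip. A sketch of an expansion-aware loop, assuming `pfile` is initialized as in section 2.1:
+
+#+begin_src c
+while (true) {
+  const cpp_token *tok = cpp_get_token (pfile);
+  if (tok->type == CPP_EOF)
+    break;
+  if (tok->type == CPP_PADDING)  // spacing placeholder from expansion
+    continue;
+  printf ("%s ", (const char *) cpp_token_as_text (pfile, tok));
+}
+#+end_src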
+
+** 3.2 Peeking and Lookahead
+Use `cpp_peek_token` to look ahead without consuming; the second argument is the lookahead index (0 is the next token):
+
+#+begin_src c
+const cpp_token *next = cpp_peek_token (pfile, 0);
+if (next->type == CPP_OPEN_PAREN)
+  printf ("Function call?\n");
+#+end_src
+
+** 3.3 Handling Unicode Identifiers
+Identifiers containing UCNs are converted to UTF-8 and interned. For a `CPP_NAME` token, `val.node.node` is the normalized identifier, while `val.node.spelling` preserves the as-written form:
+
+#+begin_src c
+if (token->type == CPP_NAME) {
+  cpp_hashnode *ident = token->val.node.node;
+  printf ("Normalized: %.*s\n",
+          (int) ident->ident.len, (const char *) ident->ident.str);
+}
+#+end_src
+
+** 3.4 Example: Skipping Comments
+`_cpp_skip_block_comment` advances past a block comment and returns true if the comment is unterminated (`skip_line_comment` is its line-comment counterpart, static within `lex.cc`):
+
+#+begin_src c
+if (_cpp_skip_block_comment (pfile))
+  cpp_error (pfile, CPP_DL_ERROR, "unterminated block comment");
+#+end_src
+
+** 3.5 Buffer Usage Examples
+
+*** 3.5.1 Allocate and Fill a Temporary Buffer
+Use `_cpp_get_buff` to allocate a scratch buffer of at least a given size. Check the remaining room (`BUFF_ROOM`, i.e. `limit - cur`) before writing, and release the buffer when done.
+
+#+begin_src c
+size_t len = 5; // number of bytes to write
+_cpp_buff *buff = _cpp_get_buff (pfile, len);
+
+// ensure the buffer has enough room
+if (BUFF_ROOM (buff) < len)
+  _cpp_extend_buff (pfile, &buff, len);
+
+// write data safely
+memcpy (buff->cur, "hello", len);
+buff->cur += len;
+
+printf ("Buffer contents: %.*s\n", (int) len, buff->base);
+_cpp_release_buff (pfile, buff);
+#+end_src
+
+*** 3.5.2 Extend a Buffer Dynamically
+Extend a buffer when a write would exceed its current size; `_cpp_extend_buff` reallocates, preserving the bytes already written.
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 64);
+
+// simulate a long write
+for (int i = 0; i < 300; ++i) {
+  if (BUFF_ROOM (buff) < 1)
+    _cpp_extend_buff (pfile, &buff, 1);
+  *buff->cur++ = 'A';
+}
+
+printf ("Expanded buffer: %.10s\n", buff->base); // first 10 chars
+#+end_src
+
+*** 3.5.3 Use Buffers in Token Construction
+Construct a macro expansion or synthetic token string. `stpcpy` returns a pointer to the terminating NUL, so successive calls append.
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 32);
+buff->cur = (unsigned char *) stpcpy ((char *) buff->cur, "MY_MACRO(");
+buff->cur = (unsigned char *) stpcpy ((char *) buff->cur, "123 + 456");
+*buff->cur++ = ')';
+*buff->cur = '\0';
+
+printf ("Token string: %s\n", buff->base);
+#+end_src
+
+*** 3.5.4 Releasing a Buffer
+After using a buffer temporarily (e.g., for lookahead), release it back to the pool:
+
+#+begin_src c
+_cpp_buff *buff = _cpp_get_buff (pfile, 256);
+// ... use the buffer ...
+_cpp_release_buff (pfile, buff);
+#+end_src
+
+*** 3.5.5 Commit and Reuse
+`_cpp_commit_buff` finalizes `size` bytes of the reader's active definition buffer (`pfile->a_buff`), as is done when a macro definition is complete, and returns a stable pointer to the committed data; the space following the commit remains available for subsequent allocations:
+
+#+begin_src c
+unsigned char *first = (unsigned char *) _cpp_commit_buff (pfile, len);
+// later allocations carve from the space following the committed region
+#+end_src
+
+* 4. Notes on Extension
+
+- You may add a new directive (e.g., `#assign`) by declaring it in `directives.cc` and adding handler logic in `macro.cc` or your own file; see the sketch after this list.
+- If you want to extend the lexer with a new token kind, you must:
+  - Add a new `cpp_ttype` enum value.
+  - Extend `_cpp_lex_token` or `lex_string` to recognize and classify it.
+  - Update `cpp_type2name` and the spelling functions.
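+
+As a concrete sketch of the first point: directives are declared through the `DIRECTIVE_TABLE` X-macro in `directives.cc`, each paired with a `do_*` handler. The entry and handler below are hypothetical (`do_assign` does not exist upstream), and the exact origin/flag values vary by GCC version:
+
+#+begin_src c
+// sketch only: following the DIRECTIVE_TABLE pattern in directives.cc
+#define DIRECTIVE_TABLE                                   \
+  D(define,  T_DEFINE,  KANDR,     IN_I)                  \
+  D(assign,  T_ASSIGN,  EXTENSION, 0)  /* new entry */    \
+  /* ... remaining upstream entries ... */
+
+static void
+do_assign (cpp_reader *pfile)
+{
+  // read the identifier being assigned, then collect the value tokens
+  cpp_hashnode *name = lex_macro_node (pfile, true);
+  if (name)
+    {
+      /* ... evaluate the right-hand side and record the binding ... */
+    }
+}
+#+end_src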
+
+* 5. Recommended Reading
+- `libcpp/include/cpp-id-data.h`: For macro flags and token identifiers
+- `libcpp/lex.cc`: Lexer core implementation
+- `libcpp/directives.cc`: Directive parsing
+- `libcpp/macro.cc`: Macro expansion
+- `libcpp/line-map.cc`: Location tracking and diagnostics
+
diff --git "a/document\360\237\226\211/tool_chain_dependency_layers.org" "b/document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org"
similarity index 100%
rename from "document\360\237\226\211/tool_chain_dependency_layers.org"
rename to "document\360\237\226\211/how_it_works/tool_chain_dependency_layers.org"
diff --git "a/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc" "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"
new file mode 100644
index 0000000..21cd614
--- /dev/null
+++ "b/script_gcc_min-12\360\237\226\211/get_paren_balanced_chars.cc"
@@ -0,0 +1,48 @@
+bool
+cpp_get_paren_balanced_chars(cpp_reader *pfile)
+{
+  const unsigned char *cur = pfile->buffer->cur;
+  const unsigned char *rlimit = pfile->buffer->rlimit;
+  cpp_buffer *buffer = pfile->buffer;
+
+  // skip leading whitespace
+  while (cur < rlimit && ISSPACE(*cur))
+    cur++;
+
+  if (cur == rlimit || *cur != '(')
+    {
+      cpp_error(pfile, CPP_DL_ERROR, "expected opening parenthesis for macro body");
+      return false;
+    }
+
+  int depth = 0;
+  const unsigned char *scan = cur;
+  while (scan < rlimit)
+    {
+      if (*scan == '(')
+        depth++;
+      else if (*scan == ')')
+        {
+          depth--;
+          if (depth == 0)
+            {
+              // copy from opening ( to matching ) inclusive
+              size_t len = scan - cur + 1;
+              unsigned char *copy = (unsigned char *)_cpp_unaligned_alloc(pfile, len + 1);
+              memcpy(copy, cur, len);
+              copy[len] = '\0';
+
+              // point the lexer buffer at just this region
+              buffer->cur = copy;
+              buffer->rlimit = copy + len;
+              buffer->next_line = NULL; // signals EOF to the lexer
+              return true;
+            }
+        }
+      scan++;
+    }
+
+  // if we got here, the closing paren was never found
+  cpp_error(pfile, CPP_DL_ERROR, "unclosed parenthesis in macro body");
+  return false;
+}
diff --git "a/script_gcc_min-12\360\237\226\211/macro.cc" "b/script_gcc_min-12\360\237\226\211/macro.cc"
index 82d8b4b..d491054 100644
--- "a/script_gcc_min-12\360\237\226\211/macro.cc"
+++ "b/script_gcc_min-12\360\237\226\211/macro.cc"
@@ -4138,7 +4138,7 @@ cpp_macro_definition (cpp_reader *pfile, cpp_hashnode *node,
 // see directives.cc
 extern const char *cpp_token_as_text(const cpp_token *token);
 
-// a helper function for probing where we are at in the parse
+// a helper function for probing where the parser thinks it is in the source
 void
 debug_peek_token (cpp_reader *pfile)
 {
@@ -4156,22 +4156,48 @@ debug_peek_token (cpp_reader *pfile)
   _cpp_backup_tokens(pfile, 1);
 }
 
+// collects the body of a #define or related directive
 static bool
-collect_macro_body_tokens (cpp_reader *pfile,
-                           cpp_macro *macro,
-                           unsigned int *num_extra_tokens_out,
-                           const char *paste_op_error_msg)
-{
+collect_body_tokens(
+  cpp_reader *pfile
+  ,cpp_macro *macro
+  ,unsigned int *num_extra_tokens_out
+  ,const char *paste_op_error_msg
+  ,bool paren_matching
+){
   bool following_paste_op = false;
   unsigned int num_extra_tokens = 0;
+  int paren_depth;
+  cpp_token *token;
+
+  if(paren_matching){
+    token = _cpp_lex_direct(pfile);
+    if(token->type != CPP_OPEN_PAREN){
+      cpp_error_with_line(
+        pfile
+        ,CPP_DL_ERROR
+        ,token->src_loc
+        ,0
+        ,"expected body delimiter '(', but found: %s"
+        ,cpp_token_as_text(token)
+      );
+      fprintf(stderr, "exiting collect_body_tokens, did not find opening paren\n");
+      return false;
+    }
+    paren_depth = 1;
+    fprintf(stderr, "entry paren_depth: %d\n", paren_depth);
+  }
 
   for (vaopt_state vaopt_tracker (pfile, macro->variadic, NULL);; )
     {
-      cpp_token *token = NULL;
-
+      // lex_expansion_token() appends the next token at
+      // macro->exp.tokens[macro->count]; take a pointer to it and
+      // advance the count
       macro = lex_expansion_token(pfile, macro);
       token = &macro->exp.tokens[macro->count++];
+      fprintf(stderr, "top of loop, read token %s\n", cpp_token_as_text(token));
 
+      // recognize macro args, give them type CPP_MACRO_ARG
       if (macro->count > 1 && token[-1].type == CPP_HASH && macro->fun_like)
 	{
 	  if (token->type == CPP_MACRO_ARG
@@ -4193,27 +4219,57 @@ collect_macro_body_tokens (cpp_reader *pfile,
 	    {
 	      cpp_error(pfile, CPP_DL_ERROR,
 			"'#' is not followed by a macro parameter");
+	      fprintf(stderr, "exiting collect_body_tokens, not a macro arg and language is not ASM\n");
 	      return false;
 	    }
 	}
 
-      if (token->type == CPP_EOF)
-	{
-	  if (following_paste_op)
-	    {
-	      cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
-	      return false;
-	    }
-	  if (!vaopt_tracker.completed())
-	    return false;
-	  break;
-	}
+      // parentheses matching overhead
+      if(paren_matching){
+	if(token->type == CPP_OPEN_PAREN || token->type == CPP_CLOSE_PAREN){
+	  if(token->type == CPP_OPEN_PAREN) paren_depth++;
+	  if(token->type == CPP_CLOSE_PAREN) paren_depth--;
+	  fprintf(stderr, "new paren_depth: %d\n", paren_depth);
+	}
+
+	if(token->type == CPP_EOF){
+	  fprintf(stderr, "found CPP_EOF at paren depth %d\n", paren_depth);
+	  macro->count--;
+	  if(!_cpp_get_fresh_line(pfile)){
+	    fprintf(stderr, "exiting collect_body_tokens, _cpp_get_fresh_line failed\n");
+	    return false;
+	  }
+	  fprintf(stderr, "found CPP_EOF at depth %d, read new line, now continuing loop\n", paren_depth);
+	  continue;
+	}
+      }
+
+      // exit the loop at the end of the macro body
+      if(
+	(paren_matching && paren_depth == 0)
+	|| (!paren_matching && token->type == CPP_EOF)
+      ){
+	fprintf(stderr, "exiting macro body collect loop\n");
+	if(following_paste_op){
+	  cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
+	  fprintf(stderr, "exiting collect_body_tokens due to following_paste_op\n");
+	  return false;
+	}
+	if(!vaopt_tracker.completed()){
+	  fprintf(stderr, "exiting collect_body_tokens due to !vaopt_tracker.completed()\n");
+	  return false;
+	}
+	*num_extra_tokens_out = num_extra_tokens;
+	macro->count--; // drop the terminator
+	return true;
+      }
 
       if (token->type == CPP_PASTE)
 	{
 	  if (macro->count == 1)
 	    {
 	      cpp_error(pfile, CPP_DL_ERROR, paste_op_error_msg);
+	      fprintf(stderr, "exiting collect_body_tokens, paste event\n");
 	      return false;
 	    }
 
@@ -4233,18 +4289,18 @@
 	  following_paste_op = true;
 	}
-      else
+      else{
 	following_paste_op = false;
+      }
 
-      if (vaopt_tracker.update(token) == vaopt_state::ERROR)
+      if (vaopt_tracker.update(token) == vaopt_state::ERROR){
+	fprintf(stderr, "exiting collect_body_tokens due to vaopt_tracker.update(token) == vaopt_state::ERROR\n");
 	return false;
+      }
     }
-
-  *num_extra_tokens_out = num_extra_tokens;
-  return true;
 }
 
 //--------------------------------------------------------------------------------
 // for `#macro` directive
 /*
@@ -4271,6 +4327,9 @@ create_iso_macro (cpp_reader *pfile)
   bool ok = false;
   cpp_macro *macro = NULL;
 
+  int saved_in_directive = pfile->state.in_directive;
+  int saved_keep_tokens = pfile->keep_tokens;
+
   /*
   -Saves token allocation address held in pfile->cur_token.
   -Gives a new token allocation address to pfile->cur_token, that of cpp_token first.
@@ -4330,16 +4389,35 @@ create_iso_macro (cpp_reader *pfile)
   macro->parm.params = params;
   macro->fun_like = true;
 
-  // collects from pfile the tokens that constitute the macro body
-  if (!collect_macro_body_tokens(pfile, macro, &num_extra_tokens, paste_op_error_msg))
-    goto out;
+  /*
+    Collect the macro body tokens.
+    A #macro () body is delimited by parentheses.
+  */
+  pfile->state.in_directive = 0; // allow fresh lines
+  pfile->keep_tokens = 1;
+
+  // collects the remaining body tokens
+  if(
+    !collect_body_tokens(
+      pfile
+      ,macro
+      ,&num_extra_tokens
+      ,paste_op_error_msg
+      ,true
+    )
+  ) goto out;
+
+  pfile->keep_tokens = saved_keep_tokens;
+  pfile->state.in_directive = saved_in_directive; // restore
+
+  // Mark the macro as made even though it has not been committed yet; this
+  // mirrors the upstream code, which simply discards the macro if the body
+  // fails to parse.
   ok = true;
 
-  /* Don't count the CPP_EOF. */
-  macro->count--;
-
   // commit the cpp struct to memory
   // the struct reserves space for one token, the others run off the end
   macro = (cpp_macro *)_cpp_commit_buff(
-- 
2.20.1