From: Thomas Walker Lynch Date: Sat, 1 Mar 2025 10:35:43 +0000 (+0000) Subject: checkpoint before unifying identity and reverse_order X-Git-Url: https://git.reasoningtechnology.com/style/static/gitweb.css?a=commitdiff_plain;h=ead8eff707d7a5085ee98c41343d2c5f22c7c236;p=N checkpoint before unifying identity and reverse_order --- diff --git "a/developer/cc\360\237\226\211/Copy.lib.c" "b/developer/cc\360\237\226\211/Copy.lib.c" index 4e40437..0db4d42 100644 --- "a/developer/cc\360\237\226\211/Copy.lib.c" +++ "b/developer/cc\360\237\226\211/Copy.lib.c" @@ -2,6 +2,7 @@ Copy - Memory copy operations with attention to alignment. Provides optimized copy and byte order reversal functions. + 'ATP' At This Point in the code. Assertions follow. */ #define Copy·DEBUG @@ -20,27 +21,50 @@ #include #include + #define extentof(x) (sizeof(x) - 1) + #define extent_t size_t + typedef struct{ void *read0; - size_t read_size; + extent_t read_extent; void *write0; - size_t write_size; - } Copy·it; + extent_t write_extent; + } Copy·It; + + typedef enum{ + Copy·It·Status·valid = 0 + ,Copy·It·Status·null_read + ,Copy·It·Status·null_write + ,Copy·It·Status·overlap + } Copy·It·Status; typedef enum{ - Copy·Status·perfect_fit = 0 - ,Copy·Status·argument_guard - ,Copy·Status·read_surplus - ,Copy·Status·read_surplus_write_gap - ,Copy·Status·write_available - ,Copy·Status·write_gap // write allocation has a terminal gap + Copy·Step·perfect_fit = 0 + ,Copy·Step·argument_guard + ,Copy·Step·read_surplus + ,Copy·Step·read_surplus_write_gap + ,Copy·Step·write_available + ,Copy·Step·write_gap } Copy·Status; typedef struct{ - void *region(void *read0 ,void *read1 ,void *write0); + bool Copy·IntervalPts·in(void *pt, void *pt0 ,void *pt1); + bool Copy·IntervalPts·contains(void *pt00 ,void *pt01 ,void *pt10 ,void *pt11); + bool Copy·IntervalPts·overlap(void *pt00 ,void *pt01, void *pt10 ,void *pt11); + + bool Copy·IntervalPtSize·in(void *pt, void *pt0 ,size_t s); + bool Copy·IntervalPtSize·overlap(void *pt00 ,size_t s0, void *pt10 ,size_t s1); + + Copy·It·Status Copy·wellformed_it(Copy·It *it) + + void *identity(void *read0 ,void *read1 ,void *write0); void *reverse_byte_order(void *read0 ,void *read1 ,void *write0); - } Copy·M; + Copy·Status Copy·Step·identity(Copy·It *it); + Copy·Status Copy·Step·reverse_order(Copy·It *it); + Copy·Status Copy·Step·write_hex(Copy·It *it); + Copy·Status Copy·Step·read_hex(Copy·It *it); + } Copy·M; #endif @@ -49,323 +73,320 @@ #ifdef Copy·IMPLEMENTATION - // this part goes into Nlib.a + #ifdef Copy·DEBUG + #include + #endif + + // this part goes into Copylib.a + // yes this is empty, so there is no Copylib.a #ifndef LOCAL #endif #ifdef LOCAL + // Interval predicates. + // Intervals in Copy have inclusive bounds + Local bool Copy·IntervalPts·in(void *pt, void *pt0 ,void *pt1){ + return pt >= pt0 && pt <= pt1; // Inclusive bounds + } + Local bool Copy·in_extent_interval(void *pt, void *pt0 ,extent_t e){ + return Copy·IntervalPts·in(pt ,pt0 ,pt0 + e); + } - #ifdef Copy·DEBUG - #include // Only for debug prints, not used in production. + // interval 0 contains interval 1, overlap on boundaries allowed. 
+ Local bool Copy·IntervalPts·contains( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return pt10 >= pt00 && pt11 <= pt01; + } -typedef enum{ - Copy·StatusWFIt·none = 0x00 - ,Copy·StatusWFIt·null_read = 0x01 - ,Copy·StatusWFIt·null_write = 0x02 - ,Copy·StatusWFIt·zero_read_size = 0x04 - ,Copy·StatusWFIt·zero_write_size = 0x08 - ,Copy·StatusWFIt·write_too_small_hex = 0x10 - ,Copy·StatusWFIt·read_too_small_hex = 0x20 - ,Copy·StatusWFIt·read_larger_than_write = 0x40 - ,Copy·StatusWFIt·overlapping_buffers = 0x80 -} Copy·StatusWFIt; + // interval 0 properly contains interval 1, overlap on boundaries not allowed. + Local bool Copy·contains_proper_pt_interval( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return pt10 > pt00 && pt11 < pt01; + } -typedef enum{ - Copy·ModeWFIt·none = 0x00 - ,Copy·ModeWFIt·bytes = 0x01 - ,Copy·ModeWFIt·reverse = 0x02 - ,Copy·ModeWFIt·write_hex = 0x03 - ,Copy·ModeWFIt·from_hex = 0x04 -} Copy·ModeWFIt; + // Possible cases of overlap, including just touching + // 1. interval 0 to the right of interval 1, just touching p00 == p11 + // 2. interval 0 to the left of interval 1, just touching p01 == p10 + // 3. interval 0 wholly contained in interval 1 + // 4. interval 0 wholly contains interval 1 + Local bool Copy·IntervalPts·overlap(void *pt00 ,void *pt01, void *pt10 ,void *pt11){ + return + Copy·IntervalPts·in(pt00 ,pt10 ,pt11) // #1, #3 + || Copy·IntervalPts·in(pt10 ,pt00 ,pt01) // #2, #4 + ; + } + Local bool Copy·overlap_extent_interval(void *pt00 ,extent_t e0, void *pt10 ,extent_t e1){ + return Copy·IntervalPts·overlap(pt00 ,pt00 + e0 ,pt10 ,pt10 + e1); + } + Local Copy·It·Status Copy·It·wellformed(Copy·It *it){ + char *this_name = "Copy·It·wellformed"; + Copy·It·Status status = Copy·It·Status·valid; + if(it->read0 == NULL){ + fprintf(stderr, "%s: NULL read pointer\n", this_name); + status |= Copy·It·Status·null_read; + } -#endif + if(it->write0 == NULL){ + fprintf(stderr, "%s: NULL write pointer\n", this_name); + status |= Copy·It·Status·null_write; + } + if( + Copy·overlap_extent_interval(it->read0 ,it->read_extent ,it->write0 ,it->write_extent) + ){ + fprintf(stderr, "%s: Read and write buffers overlap!\n", this_name); + status |= Copy·It·Status·overlap; + } - /* - Copy·region - Copies a memory region while preserving byte order. - - Aligns reads for performance. - - Writes are assumed to be buffered and do not require alignment. - - Returns the updated write pointer. - */ - Local void *Copy·bytes(void *read0 ,void *read1 ,void *write0){ + return status; + } - uint8_t *r = (uint8_t *)read0; - uint8_t *r1 = (uint8_t *)read1; - uint8_t *w = (uint8_t *)write0; + // consider an 8 byte window that is aligned + // returns the byte pointer to the least address byte in the window + Local void *Copy·floor_64(void *p){ + return (uintptr_t)p & ~(uintptr_t)0x7; + } - //---------- - // The potentially unaligned initial part (align read pointer). - if( (uintptr_t)r & 0x7 ){ - - // ORing in `0x7` adds at most six bytes to r. 
- uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7); - - // If the read interval is very small - if(r01 >= r1){ - while(r < r1){ - *w++ = *r++; - } - return w; - } - - // Copy up to alignment boundary - do{ - *w++ = *r++; - }while(r <= r01); - } - // r is now aligned, but *r has not yet been copied + // consider an 8 byte window that is aligned + // returns the byte pointer to the greatest address byte in the window + Local void *Copy·ceiling_64(void *p){ + return (uintptr_t)p | 0x7; + } - //---------- - // The bulk copy part (w is still possibly unaligned, but r is aligned) - uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7); + // byte array greatest address byte at p1 (inclusive) + // byte array least address byte at p0 (inclusive) + // returns pointer to the greatest full 64-bit word-aligned address that is ≤ p1 + // by contract, p1 must be >= p0 + Local uint64_t *Copy·greatest_full_64(void *p0 ,void *p1){ - while(r < r10){ - *(uint64_t *)w = *(uint64_t *)r; - w += 8; - r += 8; - } + // If p1 - 0x7 moves into a prior word while p0 does not, a prefetch hazard can occur. + // If p1 and p0 are more than 0x7 apart, they cannot be in the same word, + // but this does not guarantee a full 64-bit word exists in the range. + if (p1 - p0 < 0x7) return NULL; - // If r1 was aligned then r10 == r1 and we are done - if(r == r1) return w; + // Compute the last fully aligned word at or before p1. + uint64_t *p1_64 = (void *)( ((uintptr_t)p1 - 0x7) & ~(uintptr_t)0x7 ); - //---------- - // The ragged tail, up to 7 bytes - do{ - *w++ = *r++; - }while(r < r1); + // If alignment rounds p1_64 below p0, there is no full word available. + if(p1_64 < p0) return NULL; - return w; + return p1_64; } - /* - Copy·reverse_byte_order - Copies a memory region while reversing byte order. - - Reads from read1 down - - writes from write0 up - - Uses `__builtin_bswap64` for efficient 64-bit swaps. - - Returns the updated write pointer. - */ - Local void *Copy·bytes_reverse_order(void *read0 ,void *read1 ,void *write0){ + // byte array greatest address byte at p1 (inclusive) + // byte array least address byte at p0 (inclusive) + // returns pointer to the least full 64-bit word-aligned address that is ≥ p0 + Local uint64_t *Copy·least_full_64(void *p0 ,void *p1){ + + // If p0 + 0x7 moves into the next word while p1 does not, a prefetch hazard can occur. + // If p1 and p0 are more than 0x7 apart, they cannot be in the same word, + // but this does not guarantee a full 64-bit word exists in the range. + if(p1 - p0 < 0x7) return NULL; + + // Compute the first fully aligned word at or after p0. + uint64_t *p0_64 = (void *)( ((uintptr_t)p0 + 0x7) & ~(uintptr_t)0x7 ); + + // If alignment rounds p0_64 beyond p1, there is no full word available. + if(p0_64 > p1) return NULL; + + return p0_64; + } + + Local void *Copy·inc64(void *p ,size_t Δ){ + return (void *)((uint64_t *)p) + Δ; + } + + Local void *Copy·identity(void *read0 ,void *read1 ,void *write0){ + + //---------------------------------------- + // argument guard + + if(read1 < read0) return NULL; + // ATP there is at least one byte to be copied + + //---------------------------------------- + // features of the byte arrays, optimizer should move this code around - uint8_t *r = (uint8_t *)read1; // Start from the last byte uint8_t *r0 = (uint8_t *)read0; - uint8_t *w = (uint8_t *)write0; + uint8_t *r1 = (uint8_t *)read1; // inclusive upper bound + uint8_t *w0 = (uint8_t *)write0; - //---------- - // The potentially unaligned initial part (align read pointer). 
- if( (uintptr_t)r & 0x7 ){ + uint8_t *r = r0; + uint8_t *w = w0; - // ANDing with `~0x7` moves it downward to the nearest lower alignment. - uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7); + // the contained uint64_t array + uint64_t *r0_64 = Copy·least_full_64(r0 ,r1); + uint64_t *r1_64 = Copy·greatest_full_64(r0 ,r1); - // If the read interval is very small - if(r10 < r0){ - while(r > r0){ - *w++ = *--r; - } - return w; - } + // ATP there might be unaligned smallest address in the array bytes - // Copy down to alignment boundary + // In fact, r0_64 and r1_64 being NULL will always occur together. + // .. then there are not many bytes to be copied + if(r0_64 == NULL || r1_64 == NULL){ do{ - *w++ = *--r; - }while(r > r10); + *w = *r; + if(r == r1) break; + w++; + r++; + }while(true); + return w; } - // r is now aligned, and *r has been copied - - //---------- - // The bulk copy part - uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7); - while(r > r01){ - r -= 8; - *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); - w += 8; + // if needed, align r + while(r < r0_64){ + *w++ = *r++; } + // ATP r == r0_64, though *r has not yet been copied + // ATP r is uint64_t aligned + // ATP there is at least one word to be copied + // ATP w is possibly not aligned - // If r0 was aligned then r01 == r0 and we are done - if(r < r0) return w; + //---------------------------------------- + // The bulk copy part + + do{ + *(uint64_t *)w = *(uint64_t *)r; + if(r == r1_64) break; + w = Copy·inc64(w ,1); + r = Copy·inc64(r ,1); + }while(true); + // ATP r == r1_64 + + // If r1 was aligned the copy is done + bool aligned_r1 = (uintptr_t)r1 == (uintptr_t)Copy·ceiling_64(r1_64); + if(aligned_r1) return w; + r = Copy·inc_64(r ,1); + w = Copy·inc_64(w ,1); + // ATP there is at least one trailing unaligned byte to copy + // *r has not yet been copied, but needs to be //---------- // The ragged tail, up to 7 bytes do{ - *w++ = *--r; - }while(r >= r0); + *w = *r; + if(r == r1) break; + w++; + r++; + }while(true); return w; } - /* - Read buffer is read from the lowest address, working toward higher addresses. + Local void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0){ - Write buffer is written from the lowest address, working to higher addresses. + //---------------------------------------- + // Argument guard - To force data to be left in the read buffer, or for capacity to be left in the - write buffer, reduce sizes. - */ - Local Copy·Status Copy·step( - Copy·it *it - ){ - uint8_t *r = (uint8_t *)it->read0; - uint8_t *w = (uint8_t *)it->write0; + if(read1 < read0) return NULL; + // ATP there is at least one byte to be copied - size_t rs = it->read_size; - size_t ws = it->write_size; - - if(ws >= rs){ - Copy·bytes(r ,r + rs ,w); - it->read0 += rs; - it->read_size = 0; - it->write0 += rs; - it->write_size -= rs; - if(ws == rs) return Copy·Status·perfect_fit; - return Copy·Status·write_available;; - } + //---------------------------------------- + // Features of the byte arrays, optimizer should move this code around - // ws < rs - Copy·bytes(r ,r + ws ,w); - it->read0 += ws; - it->read_size -= ws; - it->write_size = 0; - it->write0 += ws; - return Copy·Status·read_surplus; - } - - /* - Read buffer is read from top down. Start with the largest address - just above the read buffer. Continue into lower addresses. - - write buffer is written from bottom up. Start with the lowest address, - continue into higher addresses. 
- */ - Local Copy·Status Copy·step_reverse_order(Copy·it *it){ - // How many bytes remain to be read/written - if( it->read_size == 0) return Copy·Status·complete; - size_t rs = it->read_size; - uint8_t *r1 = (uint8_t *)it->read0 + rs; - size_t ws = it->write_size; - uint8_t *w0 = (uint8_t *)it->write0; - - if(ws >= rs){ - uint8_t *r0 = (uint8_t *)it->read0; - Copy·bytes_reverse_order(r0, r1, w0); - it->read_size = 0; - it->write0 += rs; - it->write_size -= rs; - if(it->write_size == 0) return Copy·Status·perfect_fit; - return Copy·Status·write_available; - } + uint8_t *r0 = (uint8_t *)read0; + uint8_t *r1 = (uint8_t *)read1; // inclusive upper bound + uint8_t *w0 = (uint8_t *)write0; - // ws < rs - uint8_t *r0 = r1 - ws; - Copy·bytes_reverse_order(r0, r1, w0); - it->read0 -= ws; - it->read_size -= ws; - it->write_size = 0; - it->write0 += ws; - return Copy·Status·read_surplus; - } + uint8_t *r = r1; // Start from the last byte + uint8_t *w = w0; - /* - Read bytes, write hex pairs. - Read and write are low address to high address. - Each read byte value -> 2 write allocation bytes - */ - Local Copy·Status Copy·step_write_hex( - Copy·it *it - ){ + // The contained uint64_t array + uint64_t *r0_64 = Copy·least_full_64(r0 ,r1); + uint64_t *r1_64 = Copy·greatest_full_64(r0 ,r1); - uint8_t *r = (uint8_t *)it->read0; - size_t rs = it->read_size; + // ATP there might be unaligned highest address in the array bytes - uint8_t *w = (uint8_t *)it->write0; - size_t ws = it->write_size & ~1; // even number write_size - size_t ews = it->write_size >> 1; // effective write size - - // If ews >= rs, read bytes all coped - if(ews >= rs){ - size_t ers = it->read_size << 1; // effective read size - it->write0 += ers; - it->write_size -= ers; - while(rs--){ - *(uint16_t *)w = Copy·byte_to_hex(*r++); - w += 2; - } - it->read0 = r; - it->read_size = 0; - - if(it->write_size == 0) return Copy·Status·perfect_fit; - if(it->write_size == 1) return Copy·Status·write_gap; - return Copy·Status·write_available; + // If no full words exist, fallback to byte-wise copying + if(r0_64 == NULL || r1_64 == NULL){ + do{ + *w = *r; + if(r == r0) break; + w++; + r--; + }while(true); + return w; } - // ews < rs, write allocation all used, read bytes surplus - it->read0 += ews; - it->read_size -= ews; - while(ews--){ - *(uint16_t *)w = Copy·byte_to_hex(*r++); - w += 2; + // If needed, align r + while(r > (uint8_t *)r1_64){ + *w++ = *r--; } - it->write0 = w; - it->write_size -= ws; + // ATP r == r1_64, though *r has not yet been copied + // ATP r is uint64_t aligned + // ATP there is at least one word to be copied + // ATP w is possibly not aligned + + //---------------------------------------- + // The bulk copy part + + do{ + *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); + if(r == r0_64) break; + w = Copy·inc64(w ,1); + r = Copy·inc64(r ,-1); + }while(true); + // ATP r == r0_64 + + // If r0 was aligned, the copy is done + bool aligned_r0 = (uintptr_t)r0 == (uintptr_t)Copy·floor_64(r0_64); + if(aligned_r0) return w; + + r = Copy·inc64(r ,-1); + w = Copy·inc64(w ,1); + // ATP there is at least one trailing unaligned byte to copy + // *r has not yet been copied, but needs to be + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w = *r; + if(r == r0) break; + w++; + r--; + }while(true); - if(it->write_size == 1) return Copy·Status·read_surplus_write_gap; - return Copy·Status·read_surplus; + return w; } + /* - Read hex pairs, write bytes. - Read is low address to high address. - Write is low address to high address. 
- Each read hex pair -> 1 write byte. + Read and write pointers are incremented by `extent + 1`, ensuring they do not skip + past the last valid byte. The previous `+1` was incorrect in cases where + stepping already processed the last byte. */ - Local Copy·Status Copy·step_from_hex( - Copy·it *it - ){ + Local Copy·Status Copy·Step·identity(Copy·It *it){ uint8_t *r = (uint8_t *)it->read0; - size_t rs = it->read_size & ~1; // Must be even for hex pairs. - size_t ers = rs >> 1; // Effective read size: half the number of bytes. - uint8_t *w = (uint8_t *)it->write0; - size_t ws = it->write_size; // Write size already in bytes. - - // If ws >= ers, all hex values are processed - if(ws >= ers){ - while(ers--){ - *w++ = Copy·hex_to_byte(*(uint16_t *)r); - r += 2; - } - - it->read0 = r; - it->read_size -= rs; - it->write0 = w; - it->write_size -= rs >> 1; // Each byte consumes two hex chars. - - if(it->write_size == 0) return Copy·Status·perfect_fit; - return Copy·Status·write_available; - } - // ws < ers, read allocation surplus - while(ws--){ - *w++ = Copy·hex_to_byte(*(uint16_t *)r); - r += 2; + extent_t re = it->read_extent; + extent_t we = it->write_extent; + + if(we >= re){ + Copy·bytes(r ,r + re ,w); + it->read0 += re; // Fixed stepping logic + it->read_extent = 0; + it->write0 += re; + it->write_extent -= re; + if(we == re) return Copy·Step·perfect_fit; + return Copy·Step·write_available; } - it->read0 = r; - it->read_size -= ws << 1; // Each write byte consumes two hex chars. - it->write0 = w; - it->write_size = 0; - - return Copy·Status·read_surplus; + Copy·bytes(r ,r + we ,w); + it->read0 += we; // Fixed stepping logic + it->read_extent -= we; + it->write_extent = 0; + it->write0 += we; + return Copy·Step·read_surplus; } - #endif // LOCAL - #endif // IMPLEMENTATION diff --git "a/developer/cc\360\237\226\211/update.lib.c" "b/developer/cc\360\237\226\211/update.lib.c" new file mode 100644 index 0000000..61c5cf4 --- /dev/null +++ "b/developer/cc\360\237\226\211/update.lib.c" @@ -0,0 +1,107 @@ +/* + Copy - Memory copy operations with attention to alignment. + Provides optimized copy and byte order reversal functions. 
+*/ + +#define Copy·DEBUG + +#ifndef FACE +#define Copy·IMPLEMENTATION +#define FACE +#endif + +//-------------------------------------------------------------------------------- +// Interface + +#ifndef Copy·FACE +#define Copy·FACE + + #include + #include + + typedef struct{ + void *read0 + ,size_t read_size + ,void *write0 + ,size_t write_size; + } Copy·it; + + typedef enum{ + Copy·Status·perfect_fit = 0 + ,Copy·Status·argument_guard + ,Copy·Status·read_surplus + ,Copy·Status·read_surplus_write_gap + ,Copy·Status·write_available + ,Copy·Status·write_gap; + } Copy·Status; + + typedef enum{ + Copy·WFIt·Mode·none = 0 + ,Copy·WFIt·Mode·bytes + ,Copy·WFIt·Mode·bytes_reverse + ,Copy·WFIt·Mode·write_hex + ,Copy·WFIt·Mode·read_hex; + } Copy·WFIt·Mode; + + typedef enum{ + Copy·WFIt·Status·valid = 0 + ,Copy·WFIt·Status·null_read + ,Copy·WFIt·Status·null_write + ,Copy·WFIt·Status·zero_buffer + ,Copy·WFIt·Status·overlap + ,Copy·WFIt·Status·write_too_small; + } Copy·WFIt·Status; + + typedef struct{ + void *region( void *read0 ,void *read1 ,void *write0 ) + ,void *reverse_byte_order( void *read0 ,void *read1 ,void *write0 ); + } Copy·M; + +#endif + +//-------------------------------------------------------------------------------- +// Implementation + +#ifdef Copy·IMPLEMENTATION + + // this part goes into Nlib.a + #ifndef LOCAL + #endif + + #ifdef LOCAL + + Local Copy·WFIt·Status Copy·wellformed_it(Copy·it *it){ + char *this_name = "Copy·wellformed_it"; + Copy·WFIt·Status status = Copy·WFIt·Status·valid; + + if(it->read0 == NULL){ + fprintf( stderr ,"%s: NULL read pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_read; + } + + if(it->write0 == NULL){ + fprintf( stderr ,"%s: NULL write pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_write; + } + + if(it->read_size == 0){ + fprintf( stderr ,"%s: Zero-sized read buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_read_buffer; + } + + if(it->write_size == 0){ + fprintf( stderr ,"%s: Zero-sized write buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_write_buffer; + } + + if( Copy·overlap_size_interval(it->read0 ,it->read_size ,it->write0 ,it->write_size) ){ + fprintf( stderr ,"%s: Read and write buffers overlap!\n" ,this_name ); + status |= Copy·WFIt·Status·overlap; + } + + return status; + } + + #endif // LOCAL + +#endif // IMPLEMENTATION diff --git "a/developer/deprecated\360\237\226\211/Copy.lib.c" "b/developer/deprecated\360\237\226\211/Copy.lib.c" new file mode 100644 index 0000000..4ea9ce5 --- /dev/null +++ "b/developer/deprecated\360\237\226\211/Copy.lib.c" @@ -0,0 +1,424 @@ +/* + Copy - Memory copy operations with attention to alignment. + Provides optimized copy and byte order reversal functions. 
+ +*/ + +#define Copy·DEBUG + +#ifndef FACE +#define Copy·IMPLEMENTATION +#define FACE +#endif + +//-------------------------------------------------------------------------------- +// Interface + +#ifndef Copy·FACE +#define Copy·FACE + + #include + #include + + #define extentof(x) (sizeof(x)-1) + + typedef struct{ + void *read0; + size_t read_size; + void *write0; + size_t write_size; + } Copy·it; + + // returned from the `step_X` functions + typedef enum{ + Copy·Step·perfect_fit = 0 + ,Copy·Step·argument_guard + ,Copy·Step·read_surplus + ,Copy·Step·read_surplus_write_gap + ,Copy·Step·write_availableCopy·Status· + ,Copy·Step·write_gap; + } Copy·Status; + + typedef enum{ + Copy·WFIt·Status·valid = 0 + ,Copy·WFIt·Status·null_read + ,Copy·WFIt·Status·zero_size_read + ,Copy·WFIt·Status·null_write + ,Copy·WFIt·Status·zero_size_write + ,Copy·WFIt·Status·overlap + } Copy·WFIt·Status; + + // function dictionary + typedef struct{ + void *bytes(void *read0 ,void *read1 ,void *write0); + void *reverse_byte_order(void *read0 ,void *read1 ,void *write0); + Copy·WFIt·Status Copy·wellformed_it(Copy·it *it ,Copy·WFIt·Mode mode); + } Copy·M; + +#endif + +//-------------------------------------------------------------------------------- +// Implementation + +#ifdef Copy·IMPLEMENTATION + + #ifdef Copy·DEBUG + #include // Only for debug prints, not used in production. + #endif + + + // this part goes into Copylib.a + // yes this is empty, so there is no Copylib.a + #ifndef LOCAL + #endif + + #ifdef LOCAL + + // Interval predicates. + // Intervals in Copy have an exclusive upper bound + + Local bool Copy·in_pt_interval(void *pt, void *pt0 ,void *pt1){ + return pt >= pt0 && pt < pt1; + } + Local bool Copy·in_size_interval(void *pt, void *pt0 ,size_t s){ + return Copy·in_pt_interval(pt ,pt0 ,pt0 + s); + } + + // interval 0 contains interval 1, overlap on boundaries allowed. + Local bool Copy·contains_pt_interval( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return + pt10 >= pt00 && pt11 <= pt01 + ; + } + + // Possible cases of overlap + // 1. interval 0 to the left of interval 1 + // 2. interval 0 to the right of interval 1 + // 3. interval 0 wholly contained in interval 1 + // 4. 
interval 0 wholly contains interval 1 + Local bool Copy·overlap_pt_interval(void *pt00 ,void *pt01, void *pt10 ,void *pt11){ + void *pt01_inclusive = pt01 - 1; + void *pt11_inclusive = pt11 - 1; + return + Copy·in_pt_interval(pt10 ,pt00 ,pt01) // #1, #4 + || + Copy·in_pt_interval(pt00 ,pt10 ,pt11) // #2, #3 + ; + } + Local bool Copy·overlap_size_interval(void *pt00 ,size_t s0, void *pt10 ,size_t s1){ + return Copy·overlap_pt_interval(pt00 ,pt00 + s0 ,pt10 ,pt10 + s1); + } + + Local Copy·WFIt·Status Copy·wellformed_it(Copy·it *it){ + char *this_name = "Copy·wellformed_it"; + Copy·WFIt·Status status = Copy·WFIt·Status·valid; + + if(it->read0 == NULL){ + fprintf( stderr ,"%s: NULL read pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_read; + } + + if(it->write0 == NULL){ + fprintf( stderr ,"%s: NULL write pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_write; + } + + if(it->read_size == 0){ + fprintf( stderr ,"%s: Zero-sized read buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_read_buffer; + } + + if(it->write_size == 0){ + fprintf( stderr ,"%s: Zero-sized write buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_write_buffer; + } + + if( Copy·overlap_size_interval(it->read0 ,it->read_size ,it->write0 ,it->write_size) ){ + fprintf( stderr ,"%s: Read and write buffers overlap!\n" ,this_name ); + status |= Copy·WFIt·Status·overlap; + } + + return status; + } + + /* + Identity function. read interval values are copied without modification of value + or order to the write allocation. + - Aligns reads for performance. + - Writes are assumed to be buffered and do not require alignment. + - Returns the updated write pointer. + - See doc 'Copy.org' for more details. + */ + Local void *Copy·identity(void *read0 ,void *read1 ,void *write0){ + + uint8_t *r = (uint8_t *)read0; + uint8_t *r1 = (uint8_t *)read1; + uint8_t *w = (uint8_t *)write0; + + //---------- + // The potentially unaligned initial part (align read pointer). + if( (uintptr_t)r & 0x7 ){ + + // at this point r == r0, the lower bound of the read interval + // r0 | `0x7` adds at most six bytes to r. + uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7); + + // If the read interval is very small + if(r01 >= r1){ + while(r < r1){ + *w++ = *r++; + } + return w; + } + + // Copy up to alignment boundary + do{ + *w++ = *r++; + }while(r <= r01); + } + // r is now aligned, but *r has not yet been copied + + //---------- + // The bulk copy part (w is still possibly unaligned, but r is aligned) + uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7); + + while(r < r10){ + *(uint64_t *)w = *(uint64_t *)r; + w += 8; + r += 8; + } + + // If r1 was aligned then r10 == r1 and we are done + if(r == r1) return w; + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w++ = *r++; + }while(r < r1); + + return w; + } + + /* + Copy·reverse_byte_order - Copies a memory region while reversing byte order. + - Reads from read1 down + - writes from write0 up + - Uses `__builtin_bswap64` for efficient 64-bit swaps. + - Returns the updated write pointer. + */ + Local void *Copy·bytes_reverse_order(void *read0 ,void *read1 ,void *write0){ + + uint8_t *r = (uint8_t *)read1; // Start from the last byte + uint8_t *r0 = (uint8_t *)read0; + uint8_t *w = (uint8_t *)write0; + + //---------- + // The potentially unaligned initial part (align read pointer). + if( (uintptr_t)r & 0x7 ){ + + // ANDing with `~0x7` moves it downward to the nearest lower alignment. 
+ uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7); + + // If the read interval is very small + if(r10 < r0){ + while(r > r0){ + *w++ = *--r; + } + return w; + } + + // Copy down to alignment boundary + do{ + *w++ = *--r; + }while(r > r10); + } + // r is now aligned, and *r has been copied + + //---------- + // The bulk copy part + uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7); + + while(r > r01){ + r -= 8; + *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); + w += 8; + } + + // If r0 was aligned then r01 == r0 and we are done + if(r < r0) return w; + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w++ = *--r; + }while(r >= r0); + + return w; + } + + /* + Read buffer is read from the lowest address, working toward higher addresses. + + Write buffer is written from the lowest address, working to higher addresses. + + To force data to be left in the read buffer, or for capacity to be left in the + write buffer, reduce sizes. + */ + Local Copy·Status Copy·step(Copy·it *it){ + uint8_t *r = (uint8_t *)it->read0; + uint8_t *w = (uint8_t *)it->write0; + + size_t rs = it->read_size; + size_t ws = it->write_size; + + if(ws >= rs){ + Copy·bytes(r ,r + rs ,w); + it->read0 += rs; + it->read_size = 0; + it->write0 += rs; + it->write_size -= rs; + if(ws == rs) return Copy·Step·perfect_fit; + return Copy·Step·write_available;; + } + + // ws < rs + Copy·bytes(r ,r + ws ,w); + it->read0 += ws; + it->read_size -= ws; + it->write_size = 0; + it->write0 += ws; + return Copy·Step·read_surplus; + } + + /* + Read buffer is read from top down. Start with the largest address + just above the read buffer. Continue into lower addresses. + + write buffer is written from bottom up. Start with the lowest address, + continue into higher addresses. + */ + Local Copy·Status Copy·step_reverse_order(Copy·it *it){ + // How many bytes remain to be read/written + if( it->read_size == 0) return Copy·Step·complete; + size_t rs = it->read_size; + uint8_t *r1 = (uint8_t *)it->read0 + rs; + size_t ws = it->write_size; + uint8_t *w0 = (uint8_t *)it->write0; + + if(ws >= rs){ + uint8_t *r0 = (uint8_t *)it->read0; + Copy·bytes_reverse_order(r0, r1, w0); + it->read_size = 0; + it->write0 += rs; + it->write_size -= rs; + if(it->write_size == 0) return Copy·Step·perfect_fit; + return Copy·Step·write_available; + } + + // ws < rs + uint8_t *r0 = r1 - ws; + Copy·bytes_reverse_order(r0, r1, w0); + it->read0 -= ws; + it->read_size -= ws; + it->write_size = 0; + it->write0 += ws; + return Copy·Step·read_surplus; + } + + /* + Read bytes, write hex pairs. + Read and write are low address to high address. 
+ Each read byte value -> 2 write allocation bytes + */ + Local Copy·Status Copy·step_write_hex(Copy·it *it){ + + uint8_t *r = (uint8_t *)it->read0; + size_t rs = it->read_size; + + uint8_t *w = (uint8_t *)it->write0; + size_t ws = it->write_size & ~1; // even number write_size + size_t ews = it->write_size >> 1; // effective write size + + // If ews >= rs, read bytes all coped + if(ews >= rs){ + size_t ers = it->read_size << 1; // effective read size + it->write0 += ers; + it->write_size -= ers; + while(rs--){ + *(uint16_t *)w = Copy·byte_to_hex(*r++); + w += 2; + } + it->read0 = r; + it->read_size = 0; + + if(it->write_size == 0) return Copy·Step·perfect_fit; + if(it->write_size == 1) return Copy·Step·write_gap; + return Copy·Step·write_available; + } + + // ews < rs, write allocation all used, read bytes surplus + it->read0 += ews; + it->read_size -= ews; + while(ews--){ + *(uint16_t *)w = Copy·byte_to_hex(*r++); + w += 2; + } + it->write0 = w; + it->write_size -= ws; + + if(it->write_size == 1) return Copy·Step·read_surplus_write_gap; + return Copy·Step·read_surplus; + } + + /* + Read hex pairs, write bytes. + Read is low address to high address. + Write is low address to high address. + Each read hex pair -> 1 write byte. + */ + Local Copy·Status Copy·step_read_hex(Copy·it *it){ + uint8_t *r = (uint8_t *)it->read0; + size_t rs = it->read_size & ~1; // Must be even for hex pairs. + size_t ers = rs >> 1; // Effective read size: half the number of bytes. + + uint8_t *w = (uint8_t *)it->write0; + size_t ws = it->write_size; // Write size already in bytes. + + // If ws >= ers, all hex values are processed + if(ws >= ers){ + while(ers--){ + *w++ = Copy·hex_to_byte(*(uint16_t *)r); + r += 2; + } + + it->read0 = r; + it->read_size -= rs; + it->write0 = w; + it->write_size -= rs >> 1; // Each byte consumes two hex chars. + + if(it->write_size == 0) return Copy·Step·perfect_fit; + return Copy·Step·write_available; + } + + // ws < ers, read allocation surplus + while(ws--){ + *w++ = Copy·hex_to_byte(*(uint16_t *)r); + r += 2; + } + + it->read0 = r; + it->read_size -= ws << 1; // Each write byte consumes two hex chars. + it->write0 = w; + it->write_size = 0; + + return Copy·Step·read_surplus; + } + + + #endif // LOCAL + + +#endif // IMPLEMENTATION diff --git "a/developer/document\360\237\226\211/Copy.org" "b/developer/document\360\237\226\211/Copy.org" index 38b9729..6ab73ae 100644 --- "a/developer/document\360\237\226\211/Copy.org" +++ "b/developer/document\360\237\226\211/Copy.org" @@ -70,3 +70,24 @@ Copy·Status result = Copy·step(©_instance); This module provides an efficient and flexible memory copying framework suitable for low-level data manipulation tasks. +* Relevant todo note + +2025-02-24T08:09:57Z Copy.lib.c forced alginment machine might have issues with the block copy. + + The block copy aligns the read pointer by copying some initial + bytes. It ignores the alignment on the write pointer. Then at the end it does a byte + by byte copy of the ragged tail (less than a full word number of bytes). + + For a system that forces alignment, the initial alignment of the read pointer will get skipped. The write pointer will be aligned, so there is no problem in not checking it. + + However, the ragged tail loop can fire on a forced aligned + system. This will happen if the bounding read pointer passed in to + the block copy is not pointing to the first byte of a word. This can + happen if it is created adding `sizeof` of an object with that is not an even number + of bytes in a word long. 
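+  As a concrete illustration, the sketch below builds an inclusive upper bound from
+  `sizeof` of a 13 byte record (the `Record` type and `demo` function are hypothetical,
+  not part of the module); even when `read0` is word aligned, `read1` lands mid word,
+  so the ragged tail byte loop runs:
+
+  #+BEGIN_SRC c
+    #include <stdint.h>
+
+    // Hypothetical record type, for illustration only.
+    typedef struct{ char name[13]; } Record;   // sizeof(Record) == 13, not a whole number of words
+
+    // As defined in Copy.lib.c (project naming convention).
+    void *Copy·identity(void *read0 ,void *read1 ,void *write0);
+
+    void *demo(Record *src ,uint8_t *dst){
+      uint8_t *read0 = (uint8_t *)src;
+      uint8_t *read1 = read0 + sizeof(Record) - 1;   // inclusive upper bound, i.e. read0 + extentof
+      // Even when read0 is 8-byte aligned, read1 sits at offset 12, which is not the
+      // last byte of a word, so the ragged tail loop at the end of the block copy fires.
+      return Copy·identity(read0 ,read1 ,dst);
+    }
+  #+END_SRC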
+
+  The solution is probably to set a 'force align' macro based on the
+  architecture macro and gating the ragged tail code to do a word
+  copy, or adjusting the bulk section to do one more loop -- or having
+  a different block copy and a bytes_reversed block copy that loop on
+  words.
diff --git "a/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org" "b/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org"
new file mode 100644
index 0000000..2081b61
--- /dev/null
+++ "b/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org"
@@ -0,0 +1,88 @@
+#+TITLE: Exclusive vs. Inclusive Bounds
+#+AUTHOR: Thomas & Eidolon
+#+DATE: 2025-02-28
+#+OPTIONS: toc:nil
+
+* Introduction
+This note weighs the advantages and drawbacks of exclusive and inclusive bounds in programming, with a particular focus on C-style memory intervals. Exclusive upper bounds have been a longstanding convention in C and derived languages, but they have also led to significant issues, including microprocessor bugs. The alternative, inclusive bounds, offers advantages of its own, particularly in preventing out-of-bounds memory access.
+
+* Exclusive Bounds in C
+An exclusive bound means that the upper bound points one element past the actual interval. This approach aligns with C's idioms and iteration patterns:
+
+- A pointer iterated through memory naturally stops when it equals the upper bound.
+- The interval length is simply the upper bound minus the lower bound.
+
+However, this convention has caused notable problems:
+- The upper bound address may fall outside the allocated memory range.
+- In hardware, this can lead to prefetching errors, page faults, and potential security issues due to speculative execution.
+- In software, off-by-one errors frequently arise when handling array lengths and loops.
+
+A personal anecdote from AMD illustrates the severity of this issue: speculative execution could cause processors to prefetch addresses that lay outside valid memory pages, leading to processor bugs that were not initially acknowledged.
+
+* Inclusive Bounds: A Cleaner Approach?
+Inclusive bounds, in contrast, place the upper bound within the interval, reducing the risk of out-of-bounds memory accesses. Some advantages include:
+
+- The highest valid index remains within the range of representable values.
+- Iteration terminates on a `>` comparison rather than `==`, so an accidental overshoot still ends the loop.
+- The class of off-by-one errors associated with exclusive bounds is eliminated.
+
+This approach is used in some hardware and computational models, as detailed in TTCA. The book advocates extent-based indexing rather than length-based indexing to ensure safe iteration patterns.
+
+* The Boundary Issue in Inclusive Upper Bounds
+
+While inclusive bounds provide intuitive indexing and prevent off-by-one errors, they introduce a fundamental issue for **typed memory operations**, such as bulk copying or aligned processing.
+
+- A **pointer to the first byte** of an interval is **also a pointer to the first word**, making inclusive lower bounds type-agnostic.
+- However, a **pointer to the last byte** is **not** a pointer to the last word—it is merely the last address in the range.
+- In contrast, an **exclusive upper bound** remains type-agnostic, as it represents an address just past the valid range, which works independently of element size (see the sketch below).
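+
+The sketch below makes the adjustment concrete (the helper names are hypothetical; the inclusive variant mirrors `Copy·greatest_full_64` from the patch above). With an exclusive bound a single alignment mask locates the last full word; with an inclusive bound the pointer must first be stepped back by the element extent, which is exactly the type knowledge the bullets describe.
+
+#+BEGIN_SRC c
+  #include <stdint.h>
+
+  // Exclusive upper bound: mask only, no element-size step-back on the bound.
+  static uint64_t *last_full_word_exclusive(uint8_t *begin ,uint8_t *end){
+    uint64_t *w = (uint64_t *)((uintptr_t)end & ~(uintptr_t)0x7) - 1;
+    if((uint8_t *)w < begin) return NULL;   // no full aligned word fits in [begin ,end)
+    return w;
+  }
+
+  // Inclusive upper bound: step back by the extent (0x7) before masking,
+  // otherwise the selected word can run past p1.
+  static uint64_t *last_full_word_inclusive(uint8_t *p0 ,uint8_t *p1){
+    if(p1 - p0 < 0x7) return NULL;          // fewer than 8 bytes in [p0 ,p1]
+    uint64_t *w = (uint64_t *)(((uintptr_t)p1 - 0x7) & ~(uintptr_t)0x7);
+    if((uint8_t *)w < p0) return NULL;      // aligned word would start before the interval
+    return w;
+  }
+#+END_SRC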
+
+** Why This Matters for Bulk Operations
+Memory operations often process data in **word-sized chunks** for efficiency:
+- Copying memory in **aligned 64-bit words** requires knowing where the last valid word begins.
+- With **exclusive bounds**, this is straightforward: iteration stops at the upper bound.
+- With **inclusive bounds**, adjustments (`-8` for 64-bit words) become necessary to avoid overstepping.
+
+** Key Takeaways
+- **Inclusive lower bounds remain universally valid** and do not require type knowledge.
+- **Inclusive upper bounds require type knowledge**, which can be inefficient or unsafe.
+- **Exclusive upper bounds** naturally align with word-based processing, reducing extra adjustments.
+
+This suggests that **a hybrid approach**—inclusive lower bounds with exclusive upper bounds—may provide the best balance of **safety and efficiency**, particularly in systems with **low-level memory operations**.
+
+* Implementation Considerations
+Converting from exclusive to inclusive bounds is not trivial, as many established languages and libraries assume exclusive bounds. Some challenges include:
+
+- Existing APIs and standard libraries expect exclusive bounds, requiring additional adjustments.
+- Iteration logic must be adapted to use `<=` instead of `<` in many cases.
+- Some optimizations, such as pointer arithmetic against an exclusive bound, may require rethinking.
+
+* Conclusion
+The choice between exclusive and inclusive bounds is not merely stylistic; it has real implications for safety, correctness, and performance. While exclusive bounds remain dominant in C-derived languages, inclusive bounds eliminate a class of potential errors and are often preferable when designing new architectures or computational models.
+
+For TTCA, inclusive bounds were chosen specifically to prevent the issues that arise from exclusive bounds. Future discussions may explore the feasibility of transitioning software ecosystems toward inclusive bounds, or at least providing safer abstractions to mitigate the risks of exclusive bounds.
+
+* References
+- Thomas, *Tom's Turing Complete Computing Architecture (TTCA)*.
+
+* Summary of Observations
+Here is what stands out from this exercise:
+
+- Inclusive lower bounds work cleanly.
+  - The starting address is always valid, so there is no need for adjustments.
+  - Iteration is straightforward without additional logic.
+- Inclusive upper bounds require adjustments in word-based operations.
+  - A pointer to the last byte is not a pointer to the last word.
+  - This forces explicit alignment corrections (`& ~0x7` style masking).
+  - These corrections introduce additional computation (`-8`, `+1`, conditionals).
+- Exclusive upper bounds simplify bulk memory operations.
+  - If the loop simply runs while `ptr < end`, word-aligned processing works naturally.
+  - No extra adjustments are needed before entering the bulk copy loop.
+  - This matches how hardware and assembly-level operations work.
+- A hybrid approach may be the best path.
+  - Inclusive lower bounds keep indexing intuitive.
+  - Exclusive upper bounds avoid unnecessary adjustments in word-based processing.
+  - This mirrors how C and assembly tend to handle memory intervals (start inclusive, end exclusive).
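+
+As a closing sketch of the hybrid convention (inclusive lower bound, exclusive upper bound), the loop below copies a byte interval word by word and then finishes the ragged tail. It is an illustration only, not the module's `Copy·identity`: it uses `memcpy` for the word moves and ignores read alignment to keep the example short.
+
+#+BEGIN_SRC c
+  #include <stdint.h>
+  #include <string.h>
+
+  // begin : first valid byte (inclusive)
+  // end   : one past the last valid byte (exclusive)
+  // dst   : write pointer, returned advanced past the last byte written
+  static uint8_t *copy_hybrid(uint8_t *begin ,uint8_t *end ,uint8_t *dst){
+    uint8_t *r = begin;
+    while(end - r >= 8){          // bulk part: no -8 or +1 corrections on the bound
+      memcpy(dst ,r ,8);
+      r   += 8;
+      dst += 8;
+    }
+    while(r < end){               // ragged tail, fewer than 8 bytes
+      *dst++ = *r++;
+    }
+    return dst;
+  }
+#+END_SRC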