From: Thomas Walker Lynch Date: Sat, 1 Mar 2025 10:35:43 +0000 (+0000) Subject: checkpoint before unifying identity and reverse_order X-Git-Url: https://git.reasoningtechnology.com/style/static/gitweb.css?a=commitdiff_plain;h=ead8eff707d7a5085ee98c41343d2c5f22c7c236;p=N checkpoint before unifying identity and reverse_order --- diff --git "a/developer/cc\360\237\226\211/Copy.lib.c" "b/developer/cc\360\237\226\211/Copy.lib.c" index 4e40437..0db4d42 100644 --- "a/developer/cc\360\237\226\211/Copy.lib.c" +++ "b/developer/cc\360\237\226\211/Copy.lib.c" @@ -2,6 +2,7 @@ Copy - Memory copy operations with attention to alignment. Provides optimized copy and byte order reversal functions. + 'ATP' At This Point in the code. Assertions follow. */ #define Copy·DEBUG @@ -20,27 +21,50 @@ #include #include + #define extentof(x) (sizeof(x) - 1) + #define extent_t size_t + typedef struct{ void *read0; - size_t read_size; + extent_t read_extent; void *write0; - size_t write_size; - } Copy·it; + extent_t write_extent; + } Copy·It; + + typedef enum{ + Copy·It·Status·valid = 0 + ,Copy·It·Status·null_read + ,Copy·It·Status·null_write + ,Copy·It·Status·overlap + } Copy·It·Status; typedef enum{ - Copy·Status·perfect_fit = 0 - ,Copy·Status·argument_guard - ,Copy·Status·read_surplus - ,Copy·Status·read_surplus_write_gap - ,Copy·Status·write_available - ,Copy·Status·write_gap // write allocation has a terminal gap + Copy·Step·perfect_fit = 0 + ,Copy·Step·argument_guard + ,Copy·Step·read_surplus + ,Copy·Step·read_surplus_write_gap + ,Copy·Step·write_available + ,Copy·Step·write_gap } Copy·Status; typedef struct{ - void *region(void *read0 ,void *read1 ,void *write0); + bool Copy·IntervalPts·in(void *pt, void *pt0 ,void *pt1); + bool Copy·IntervalPts·contains(void *pt00 ,void *pt01 ,void *pt10 ,void *pt11); + bool Copy·IntervalPts·overlap(void *pt00 ,void *pt01, void *pt10 ,void *pt11); + + bool Copy·IntervalPtSize·in(void *pt, void *pt0 ,size_t s); + bool Copy·IntervalPtSize·overlap(void *pt00 ,size_t s0, void *pt10 ,size_t s1); + + Copy·It·Status Copy·wellformed_it(Copy·It *it) + + void *identity(void *read0 ,void *read1 ,void *write0); void *reverse_byte_order(void *read0 ,void *read1 ,void *write0); - } Copy·M; + Copy·Status Copy·Step·identity(Copy·It *it); + Copy·Status Copy·Step·reverse_order(Copy·It *it); + Copy·Status Copy·Step·write_hex(Copy·It *it); + Copy·Status Copy·Step·read_hex(Copy·It *it); + } Copy·M; #endif @@ -49,323 +73,320 @@ #ifdef Copy·IMPLEMENTATION - // this part goes into Nlib.a + #ifdef Copy·DEBUG + #include + #endif + + // this part goes into Copylib.a + // yes this is empty, so there is no Copylib.a #ifndef LOCAL #endif #ifdef LOCAL + // Interval predicates. + // Intervals in Copy have inclusive bounds + Local bool Copy·IntervalPts·in(void *pt, void *pt0 ,void *pt1){ + return pt >= pt0 && pt <= pt1; // Inclusive bounds + } + Local bool Copy·in_extent_interval(void *pt, void *pt0 ,extent_t e){ + return Copy·IntervalPts·in(pt ,pt0 ,pt0 + e); + } - #ifdef Copy·DEBUG - #include // Only for debug prints, not used in production. + // interval 0 contains interval 1, overlap on boundaries allowed. 
+ Local bool Copy·IntervalPts·contains( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return pt10 >= pt00 && pt11 <= pt01; + } -typedef enum{ - Copy·StatusWFIt·none = 0x00 - ,Copy·StatusWFIt·null_read = 0x01 - ,Copy·StatusWFIt·null_write = 0x02 - ,Copy·StatusWFIt·zero_read_size = 0x04 - ,Copy·StatusWFIt·zero_write_size = 0x08 - ,Copy·StatusWFIt·write_too_small_hex = 0x10 - ,Copy·StatusWFIt·read_too_small_hex = 0x20 - ,Copy·StatusWFIt·read_larger_than_write = 0x40 - ,Copy·StatusWFIt·overlapping_buffers = 0x80 -} Copy·StatusWFIt; + // interval 0 properly contains interval 1, overlap on boundaries not allowed. + Local bool Copy·contains_proper_pt_interval( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return pt10 > pt00 && pt11 < pt01; + } -typedef enum{ - Copy·ModeWFIt·none = 0x00 - ,Copy·ModeWFIt·bytes = 0x01 - ,Copy·ModeWFIt·reverse = 0x02 - ,Copy·ModeWFIt·write_hex = 0x03 - ,Copy·ModeWFIt·from_hex = 0x04 -} Copy·ModeWFIt; + // Possible cases of overlap, including just touching + // 1. interval 0 to the right of interval 1, just touching p00 == p11 + // 2. interval 0 to the left of interval 1, just touching p01 == p10 + // 3. interval 0 wholly contained in interval 1 + // 4. interval 0 wholly contains interval 1 + Local bool Copy·IntervalPts·overlap(void *pt00 ,void *pt01, void *pt10 ,void *pt11){ + return + Copy·IntervalPts·in(pt00 ,pt10 ,pt11) // #1, #3 + || Copy·IntervalPts·in(pt10 ,pt00 ,pt01) // #2, #4 + ; + } + Local bool Copy·overlap_extent_interval(void *pt00 ,extent_t e0, void *pt10 ,extent_t e1){ + return Copy·IntervalPts·overlap(pt00 ,pt00 + e0 ,pt10 ,pt10 + e1); + } + Local Copy·It·Status Copy·It·wellformed(Copy·It *it){ + char *this_name = "Copy·It·wellformed"; + Copy·It·Status status = Copy·It·Status·valid; + if(it->read0 == NULL){ + fprintf(stderr, "%s: NULL read pointer\n", this_name); + status |= Copy·It·Status·null_read; + } -#endif + if(it->write0 == NULL){ + fprintf(stderr, "%s: NULL write pointer\n", this_name); + status |= Copy·It·Status·null_write; + } + if( + Copy·overlap_extent_interval(it->read0 ,it->read_extent ,it->write0 ,it->write_extent) + ){ + fprintf(stderr, "%s: Read and write buffers overlap!\n", this_name); + status |= Copy·It·Status·overlap; + } - /* - Copy·region - Copies a memory region while preserving byte order. - - Aligns reads for performance. - - Writes are assumed to be buffered and do not require alignment. - - Returns the updated write pointer. - */ - Local void *Copy·bytes(void *read0 ,void *read1 ,void *write0){ + return status; + } - uint8_t *r = (uint8_t *)read0; - uint8_t *r1 = (uint8_t *)read1; - uint8_t *w = (uint8_t *)write0; + // consider an 8 byte window that is aligned + // returns the byte pointer to the least address byte in the window + Local void *Copy·floor_64(void *p){ + return (uintptr_t)p & ~(uintptr_t)0x7; + } - //---------- - // The potentially unaligned initial part (align read pointer). - if( (uintptr_t)r & 0x7 ){ - - // ORing in `0x7` adds at most six bytes to r. 
- uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7); - - // If the read interval is very small - if(r01 >= r1){ - while(r < r1){ - *w++ = *r++; - } - return w; - } - - // Copy up to alignment boundary - do{ - *w++ = *r++; - }while(r <= r01); - } - // r is now aligned, but *r has not yet been copied + // consider an 8 byte window that is aligned + // returns the byte pointer to the greatest address byte in the window + Local void *Copy·ceiling_64(void *p){ + return (uintptr_t)p | 0x7; + } - //---------- - // The bulk copy part (w is still possibly unaligned, but r is aligned) - uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7); + // byte array greatest address byte at p1 (inclusive) + // byte array least address byte at p0 (inclusive) + // returns pointer to the greatest full 64-bit word-aligned address that is ≤ p1 + // by contract, p1 must be >= p0 + Local uint64_t *Copy·greatest_full_64(void *p0 ,void *p1){ - while(r < r10){ - *(uint64_t *)w = *(uint64_t *)r; - w += 8; - r += 8; - } + // If p1 - 0x7 moves into a prior word while p0 does not, a prefetch hazard can occur. + // If p1 and p0 are more than 0x7 apart, they cannot be in the same word, + // but this does not guarantee a full 64-bit word exists in the range. + if (p1 - p0 < 0x7) return NULL; - // If r1 was aligned then r10 == r1 and we are done - if(r == r1) return w; + // Compute the last fully aligned word at or before p1. + uint64_t *p1_64 = (void *)( ((uintptr_t)p1 - 0x7) & ~(uintptr_t)0x7 ); - //---------- - // The ragged tail, up to 7 bytes - do{ - *w++ = *r++; - }while(r < r1); + // If alignment rounds p1_64 below p0, there is no full word available. + if(p1_64 < p0) return NULL; - return w; + return p1_64; } - /* - Copy·reverse_byte_order - Copies a memory region while reversing byte order. - - Reads from read1 down - - writes from write0 up - - Uses `__builtin_bswap64` for efficient 64-bit swaps. - - Returns the updated write pointer. - */ - Local void *Copy·bytes_reverse_order(void *read0 ,void *read1 ,void *write0){ + // byte array greatest address byte at p1 (inclusive) + // byte array least address byte at p0 (inclusive) + // returns pointer to the least full 64-bit word-aligned address that is ≥ p0 + Local uint64_t *Copy·least_full_64(void *p0 ,void *p1){ + + // If p0 + 0x7 moves into the next word while p1 does not, a prefetch hazard can occur. + // If p1 and p0 are more than 0x7 apart, they cannot be in the same word, + // but this does not guarantee a full 64-bit word exists in the range. + if(p1 - p0 < 0x7) return NULL; + + // Compute the first fully aligned word at or after p0. + uint64_t *p0_64 = (void *)( ((uintptr_t)p0 + 0x7) & ~(uintptr_t)0x7 ); + + // If alignment rounds p0_64 beyond p1, there is no full word available. + if(p0_64 > p1) return NULL; + + return p0_64; + } + + Local void *Copy·inc64(void *p ,size_t Δ){ + return (void *)((uint64_t *)p) + Δ; + } + + Local void *Copy·identity(void *read0 ,void *read1 ,void *write0){ + + //---------------------------------------- + // argument guard + + if(read1 < read0) return NULL; + // ATP there is at least one byte to be copied + + //---------------------------------------- + // features of the byte arrays, optimizer should move this code around - uint8_t *r = (uint8_t *)read1; // Start from the last byte uint8_t *r0 = (uint8_t *)read0; - uint8_t *w = (uint8_t *)write0; + uint8_t *r1 = (uint8_t *)read1; // inclusive upper bound + uint8_t *w0 = (uint8_t *)write0; - //---------- - // The potentially unaligned initial part (align read pointer). 
- if( (uintptr_t)r & 0x7 ){ + uint8_t *r = r0; + uint8_t *w = w0; - // ANDing with `~0x7` moves it downward to the nearest lower alignment. - uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7); + // the contained uint64_t array + uint64_t *r0_64 = Copy·least_full_64(r0 ,r1); + uint64_t *r1_64 = Copy·greatest_full_64(r0 ,r1); - // If the read interval is very small - if(r10 < r0){ - while(r > r0){ - *w++ = *--r; - } - return w; - } + // ATP there might be unaligned smallest address in the array bytes - // Copy down to alignment boundary + // In fact, r0_64 and r1_64 being NULL will always occur together. + // .. then there are not many bytes to be copied + if(r0_64 == NULL || r1_64 == NULL){ do{ - *w++ = *--r; - }while(r > r10); + *w = *r; + if(r == r1) break; + w++; + r++; + }while(true); + return w; } - // r is now aligned, and *r has been copied - - //---------- - // The bulk copy part - uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7); - while(r > r01){ - r -= 8; - *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); - w += 8; + // if needed, align r + while(r < r0_64){ + *w++ = *r++; } + // ATP r == r0_64, though *r has not yet been copied + // ATP r is uint64_t aligned + // ATP there is at least one word to be copied + // ATP w is possibly not aligned - // If r0 was aligned then r01 == r0 and we are done - if(r < r0) return w; + //---------------------------------------- + // The bulk copy part + + do{ + *(uint64_t *)w = *(uint64_t *)r; + if(r == r1_64) break; + w = Copy·inc64(w ,1); + r = Copy·inc64(r ,1); + }while(true); + // ATP r == r1_64 + + // If r1 was aligned the copy is done + bool aligned_r1 = (uintptr_t)r1 == (uintptr_t)Copy·ceiling_64(r1_64); + if(aligned_r1) return w; + r = Copy·inc_64(r ,1); + w = Copy·inc_64(w ,1); + // ATP there is at least one trailing unaligned byte to copy + // *r has not yet been copied, but needs to be //---------- // The ragged tail, up to 7 bytes do{ - *w++ = *--r; - }while(r >= r0); + *w = *r; + if(r == r1) break; + w++; + r++; + }while(true); return w; } - /* - Read buffer is read from the lowest address, working toward higher addresses. + Local void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0){ - Write buffer is written from the lowest address, working to higher addresses. + //---------------------------------------- + // Argument guard - To force data to be left in the read buffer, or for capacity to be left in the - write buffer, reduce sizes. - */ - Local Copy·Status Copy·step( - Copy·it *it - ){ - uint8_t *r = (uint8_t *)it->read0; - uint8_t *w = (uint8_t *)it->write0; + if(read1 < read0) return NULL; + // ATP there is at least one byte to be copied - size_t rs = it->read_size; - size_t ws = it->write_size; - - if(ws >= rs){ - Copy·bytes(r ,r + rs ,w); - it->read0 += rs; - it->read_size = 0; - it->write0 += rs; - it->write_size -= rs; - if(ws == rs) return Copy·Status·perfect_fit; - return Copy·Status·write_available;; - } + //---------------------------------------- + // Features of the byte arrays, optimizer should move this code around - // ws < rs - Copy·bytes(r ,r + ws ,w); - it->read0 += ws; - it->read_size -= ws; - it->write_size = 0; - it->write0 += ws; - return Copy·Status·read_surplus; - } - - /* - Read buffer is read from top down. Start with the largest address - just above the read buffer. Continue into lower addresses. - - write buffer is written from bottom up. Start with the lowest address, - continue into higher addresses. 
- */ - Local Copy·Status Copy·step_reverse_order(Copy·it *it){ - // How many bytes remain to be read/written - if( it->read_size == 0) return Copy·Status·complete; - size_t rs = it->read_size; - uint8_t *r1 = (uint8_t *)it->read0 + rs; - size_t ws = it->write_size; - uint8_t *w0 = (uint8_t *)it->write0; - - if(ws >= rs){ - uint8_t *r0 = (uint8_t *)it->read0; - Copy·bytes_reverse_order(r0, r1, w0); - it->read_size = 0; - it->write0 += rs; - it->write_size -= rs; - if(it->write_size == 0) return Copy·Status·perfect_fit; - return Copy·Status·write_available; - } + uint8_t *r0 = (uint8_t *)read0; + uint8_t *r1 = (uint8_t *)read1; // inclusive upper bound + uint8_t *w0 = (uint8_t *)write0; - // ws < rs - uint8_t *r0 = r1 - ws; - Copy·bytes_reverse_order(r0, r1, w0); - it->read0 -= ws; - it->read_size -= ws; - it->write_size = 0; - it->write0 += ws; - return Copy·Status·read_surplus; - } + uint8_t *r = r1; // Start from the last byte + uint8_t *w = w0; - /* - Read bytes, write hex pairs. - Read and write are low address to high address. - Each read byte value -> 2 write allocation bytes - */ - Local Copy·Status Copy·step_write_hex( - Copy·it *it - ){ + // The contained uint64_t array + uint64_t *r0_64 = Copy·least_full_64(r0 ,r1); + uint64_t *r1_64 = Copy·greatest_full_64(r0 ,r1); - uint8_t *r = (uint8_t *)it->read0; - size_t rs = it->read_size; + // ATP there might be unaligned highest address in the array bytes - uint8_t *w = (uint8_t *)it->write0; - size_t ws = it->write_size & ~1; // even number write_size - size_t ews = it->write_size >> 1; // effective write size - - // If ews >= rs, read bytes all coped - if(ews >= rs){ - size_t ers = it->read_size << 1; // effective read size - it->write0 += ers; - it->write_size -= ers; - while(rs--){ - *(uint16_t *)w = Copy·byte_to_hex(*r++); - w += 2; - } - it->read0 = r; - it->read_size = 0; - - if(it->write_size == 0) return Copy·Status·perfect_fit; - if(it->write_size == 1) return Copy·Status·write_gap; - return Copy·Status·write_available; + // If no full words exist, fallback to byte-wise copying + if(r0_64 == NULL || r1_64 == NULL){ + do{ + *w = *r; + if(r == r0) break; + w++; + r--; + }while(true); + return w; } - // ews < rs, write allocation all used, read bytes surplus - it->read0 += ews; - it->read_size -= ews; - while(ews--){ - *(uint16_t *)w = Copy·byte_to_hex(*r++); - w += 2; + // If needed, align r + while(r > (uint8_t *)r1_64){ + *w++ = *r--; } - it->write0 = w; - it->write_size -= ws; + // ATP r == r1_64, though *r has not yet been copied + // ATP r is uint64_t aligned + // ATP there is at least one word to be copied + // ATP w is possibly not aligned + + //---------------------------------------- + // The bulk copy part + + do{ + *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); + if(r == r0_64) break; + w = Copy·inc64(w ,1); + r = Copy·inc64(r ,-1); + }while(true); + // ATP r == r0_64 + + // If r0 was aligned, the copy is done + bool aligned_r0 = (uintptr_t)r0 == (uintptr_t)Copy·floor_64(r0_64); + if(aligned_r0) return w; + + r = Copy·inc64(r ,-1); + w = Copy·inc64(w ,1); + // ATP there is at least one trailing unaligned byte to copy + // *r has not yet been copied, but needs to be + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w = *r; + if(r == r0) break; + w++; + r--; + }while(true); - if(it->write_size == 1) return Copy·Status·read_surplus_write_gap; - return Copy·Status·read_surplus; + return w; } + /* - Read hex pairs, write bytes. - Read is low address to high address. - Write is low address to high address. 
- Each read hex pair -> 1 write byte. + Read and write pointers are incremented by `extent + 1`, ensuring they do not skip + past the last valid byte. The previous `+1` was incorrect in cases where + stepping already processed the last byte. */ - Local Copy·Status Copy·step_from_hex( - Copy·it *it - ){ + Local Copy·Status Copy·Step·identity(Copy·It *it){ uint8_t *r = (uint8_t *)it->read0; - size_t rs = it->read_size & ~1; // Must be even for hex pairs. - size_t ers = rs >> 1; // Effective read size: half the number of bytes. - uint8_t *w = (uint8_t *)it->write0; - size_t ws = it->write_size; // Write size already in bytes. - - // If ws >= ers, all hex values are processed - if(ws >= ers){ - while(ers--){ - *w++ = Copy·hex_to_byte(*(uint16_t *)r); - r += 2; - } - - it->read0 = r; - it->read_size -= rs; - it->write0 = w; - it->write_size -= rs >> 1; // Each byte consumes two hex chars. - - if(it->write_size == 0) return Copy·Status·perfect_fit; - return Copy·Status·write_available; - } - // ws < ers, read allocation surplus - while(ws--){ - *w++ = Copy·hex_to_byte(*(uint16_t *)r); - r += 2; + extent_t re = it->read_extent; + extent_t we = it->write_extent; + + if(we >= re){ + Copy·bytes(r ,r + re ,w); + it->read0 += re; // Fixed stepping logic + it->read_extent = 0; + it->write0 += re; + it->write_extent -= re; + if(we == re) return Copy·Step·perfect_fit; + return Copy·Step·write_available; } - it->read0 = r; - it->read_size -= ws << 1; // Each write byte consumes two hex chars. - it->write0 = w; - it->write_size = 0; - - return Copy·Status·read_surplus; + Copy·bytes(r ,r + we ,w); + it->read0 += we; // Fixed stepping logic + it->read_extent -= we; + it->write_extent = 0; + it->write0 += we; + return Copy·Step·read_surplus; } - #endif // LOCAL - #endif // IMPLEMENTATION diff --git "a/developer/cc\360\237\226\211/update.lib.c" "b/developer/cc\360\237\226\211/update.lib.c" new file mode 100644 index 0000000..61c5cf4 --- /dev/null +++ "b/developer/cc\360\237\226\211/update.lib.c" @@ -0,0 +1,107 @@ +/* + Copy - Memory copy operations with attention to alignment. + Provides optimized copy and byte order reversal functions. 
+*/ + +#define Copy·DEBUG + +#ifndef FACE +#define Copy·IMPLEMENTATION +#define FACE +#endif + +//-------------------------------------------------------------------------------- +// Interface + +#ifndef Copy·FACE +#define Copy·FACE + + #include + #include + + typedef struct{ + void *read0 + ,size_t read_size + ,void *write0 + ,size_t write_size; + } Copy·it; + + typedef enum{ + Copy·Status·perfect_fit = 0 + ,Copy·Status·argument_guard + ,Copy·Status·read_surplus + ,Copy·Status·read_surplus_write_gap + ,Copy·Status·write_available + ,Copy·Status·write_gap; + } Copy·Status; + + typedef enum{ + Copy·WFIt·Mode·none = 0 + ,Copy·WFIt·Mode·bytes + ,Copy·WFIt·Mode·bytes_reverse + ,Copy·WFIt·Mode·write_hex + ,Copy·WFIt·Mode·read_hex; + } Copy·WFIt·Mode; + + typedef enum{ + Copy·WFIt·Status·valid = 0 + ,Copy·WFIt·Status·null_read + ,Copy·WFIt·Status·null_write + ,Copy·WFIt·Status·zero_buffer + ,Copy·WFIt·Status·overlap + ,Copy·WFIt·Status·write_too_small; + } Copy·WFIt·Status; + + typedef struct{ + void *region( void *read0 ,void *read1 ,void *write0 ) + ,void *reverse_byte_order( void *read0 ,void *read1 ,void *write0 ); + } Copy·M; + +#endif + +//-------------------------------------------------------------------------------- +// Implementation + +#ifdef Copy·IMPLEMENTATION + + // this part goes into Nlib.a + #ifndef LOCAL + #endif + + #ifdef LOCAL + + Local Copy·WFIt·Status Copy·wellformed_it(Copy·it *it){ + char *this_name = "Copy·wellformed_it"; + Copy·WFIt·Status status = Copy·WFIt·Status·valid; + + if(it->read0 == NULL){ + fprintf( stderr ,"%s: NULL read pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_read; + } + + if(it->write0 == NULL){ + fprintf( stderr ,"%s: NULL write pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_write; + } + + if(it->read_size == 0){ + fprintf( stderr ,"%s: Zero-sized read buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_read_buffer; + } + + if(it->write_size == 0){ + fprintf( stderr ,"%s: Zero-sized write buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_write_buffer; + } + + if( Copy·overlap_size_interval(it->read0 ,it->read_size ,it->write0 ,it->write_size) ){ + fprintf( stderr ,"%s: Read and write buffers overlap!\n" ,this_name ); + status |= Copy·WFIt·Status·overlap; + } + + return status; + } + + #endif // LOCAL + +#endif // IMPLEMENTATION diff --git "a/developer/deprecated\360\237\226\211/Copy.lib.c" "b/developer/deprecated\360\237\226\211/Copy.lib.c" new file mode 100644 index 0000000..4ea9ce5 --- /dev/null +++ "b/developer/deprecated\360\237\226\211/Copy.lib.c" @@ -0,0 +1,424 @@ +/* + Copy - Memory copy operations with attention to alignment. + Provides optimized copy and byte order reversal functions. 
+ +*/ + +#define Copy·DEBUG + +#ifndef FACE +#define Copy·IMPLEMENTATION +#define FACE +#endif + +//-------------------------------------------------------------------------------- +// Interface + +#ifndef Copy·FACE +#define Copy·FACE + + #include + #include + + #define extentof(x) (sizeof(x)-1) + + typedef struct{ + void *read0; + size_t read_size; + void *write0; + size_t write_size; + } Copy·it; + + // returned from the `step_X` functions + typedef enum{ + Copy·Step·perfect_fit = 0 + ,Copy·Step·argument_guard + ,Copy·Step·read_surplus + ,Copy·Step·read_surplus_write_gap + ,Copy·Step·write_availableCopy·Status· + ,Copy·Step·write_gap; + } Copy·Status; + + typedef enum{ + Copy·WFIt·Status·valid = 0 + ,Copy·WFIt·Status·null_read + ,Copy·WFIt·Status·zero_size_read + ,Copy·WFIt·Status·null_write + ,Copy·WFIt·Status·zero_size_write + ,Copy·WFIt·Status·overlap + } Copy·WFIt·Status; + + // function dictionary + typedef struct{ + void *bytes(void *read0 ,void *read1 ,void *write0); + void *reverse_byte_order(void *read0 ,void *read1 ,void *write0); + Copy·WFIt·Status Copy·wellformed_it(Copy·it *it ,Copy·WFIt·Mode mode); + } Copy·M; + +#endif + +//-------------------------------------------------------------------------------- +// Implementation + +#ifdef Copy·IMPLEMENTATION + + #ifdef Copy·DEBUG + #include // Only for debug prints, not used in production. + #endif + + + // this part goes into Copylib.a + // yes this is empty, so there is no Copylib.a + #ifndef LOCAL + #endif + + #ifdef LOCAL + + // Interval predicates. + // Intervals in Copy have an exclusive upper bound + + Local bool Copy·in_pt_interval(void *pt, void *pt0 ,void *pt1){ + return pt >= pt0 && pt < pt1; + } + Local bool Copy·in_size_interval(void *pt, void *pt0 ,size_t s){ + return Copy·in_pt_interval(pt ,pt0 ,pt0 + s); + } + + // interval 0 contains interval 1, overlap on boundaries allowed. + Local bool Copy·contains_pt_interval( + void *pt00 ,void *pt01 ,void *pt10 ,void *pt11 + ){ + return + pt10 >= pt00 && pt11 <= pt01 + ; + } + + // Possible cases of overlap + // 1. interval 0 to the left of interval 1 + // 2. interval 0 to the right of interval 1 + // 3. interval 0 wholly contained in interval 1 + // 4. 
interval 0 wholly contains interval 1 + Local bool Copy·overlap_pt_interval(void *pt00 ,void *pt01, void *pt10 ,void *pt11){ + void *pt01_inclusive = pt01 - 1; + void *pt11_inclusive = pt11 - 1; + return + Copy·in_pt_interval(pt10 ,pt00 ,pt01) // #1, #4 + || + Copy·in_pt_interval(pt00 ,pt10 ,pt11) // #2, #3 + ; + } + Local bool Copy·overlap_size_interval(void *pt00 ,size_t s0, void *pt10 ,size_t s1){ + return Copy·overlap_pt_interval(pt00 ,pt00 + s0 ,pt10 ,pt10 + s1); + } + + Local Copy·WFIt·Status Copy·wellformed_it(Copy·it *it){ + char *this_name = "Copy·wellformed_it"; + Copy·WFIt·Status status = Copy·WFIt·Status·valid; + + if(it->read0 == NULL){ + fprintf( stderr ,"%s: NULL read pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_read; + } + + if(it->write0 == NULL){ + fprintf( stderr ,"%s: NULL write pointer\n" ,this_name ); + status |= Copy·WFIt·Status·null_write; + } + + if(it->read_size == 0){ + fprintf( stderr ,"%s: Zero-sized read buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_read_buffer; + } + + if(it->write_size == 0){ + fprintf( stderr ,"%s: Zero-sized write buffer\n" ,this_name ); + status |= Copy·WFIt·Status·zero_write_buffer; + } + + if( Copy·overlap_size_interval(it->read0 ,it->read_size ,it->write0 ,it->write_size) ){ + fprintf( stderr ,"%s: Read and write buffers overlap!\n" ,this_name ); + status |= Copy·WFIt·Status·overlap; + } + + return status; + } + + /* + Identity function. read interval values are copied without modification of value + or order to the write allocation. + - Aligns reads for performance. + - Writes are assumed to be buffered and do not require alignment. + - Returns the updated write pointer. + - See doc 'Copy.org' for more details. + */ + Local void *Copy·identity(void *read0 ,void *read1 ,void *write0){ + + uint8_t *r = (uint8_t *)read0; + uint8_t *r1 = (uint8_t *)read1; + uint8_t *w = (uint8_t *)write0; + + //---------- + // The potentially unaligned initial part (align read pointer). + if( (uintptr_t)r & 0x7 ){ + + // at this point r == r0, the lower bound of the read interval + // r0 | `0x7` adds at most six bytes to r. + uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7); + + // If the read interval is very small + if(r01 >= r1){ + while(r < r1){ + *w++ = *r++; + } + return w; + } + + // Copy up to alignment boundary + do{ + *w++ = *r++; + }while(r <= r01); + } + // r is now aligned, but *r has not yet been copied + + //---------- + // The bulk copy part (w is still possibly unaligned, but r is aligned) + uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7); + + while(r < r10){ + *(uint64_t *)w = *(uint64_t *)r; + w += 8; + r += 8; + } + + // If r1 was aligned then r10 == r1 and we are done + if(r == r1) return w; + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w++ = *r++; + }while(r < r1); + + return w; + } + + /* + Copy·reverse_byte_order - Copies a memory region while reversing byte order. + - Reads from read1 down + - writes from write0 up + - Uses `__builtin_bswap64` for efficient 64-bit swaps. + - Returns the updated write pointer. + */ + Local void *Copy·bytes_reverse_order(void *read0 ,void *read1 ,void *write0){ + + uint8_t *r = (uint8_t *)read1; // Start from the last byte + uint8_t *r0 = (uint8_t *)read0; + uint8_t *w = (uint8_t *)write0; + + //---------- + // The potentially unaligned initial part (align read pointer). + if( (uintptr_t)r & 0x7 ){ + + // ANDing with `~0x7` moves it downward to the nearest lower alignment. 
+ uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7); + + // If the read interval is very small + if(r10 < r0){ + while(r > r0){ + *w++ = *--r; + } + return w; + } + + // Copy down to alignment boundary + do{ + *w++ = *--r; + }while(r > r10); + } + // r is now aligned, and *r has been copied + + //---------- + // The bulk copy part + uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7); + + while(r > r01){ + r -= 8; + *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r); + w += 8; + } + + // If r0 was aligned then r01 == r0 and we are done + if(r < r0) return w; + + //---------- + // The ragged tail, up to 7 bytes + do{ + *w++ = *--r; + }while(r >= r0); + + return w; + } + + /* + Read buffer is read from the lowest address, working toward higher addresses. + + Write buffer is written from the lowest address, working to higher addresses. + + To force data to be left in the read buffer, or for capacity to be left in the + write buffer, reduce sizes. + */ + Local Copy·Status Copy·step(Copy·it *it){ + uint8_t *r = (uint8_t *)it->read0; + uint8_t *w = (uint8_t *)it->write0; + + size_t rs = it->read_size; + size_t ws = it->write_size; + + if(ws >= rs){ + Copy·bytes(r ,r + rs ,w); + it->read0 += rs; + it->read_size = 0; + it->write0 += rs; + it->write_size -= rs; + if(ws == rs) return Copy·Step·perfect_fit; + return Copy·Step·write_available;; + } + + // ws < rs + Copy·bytes(r ,r + ws ,w); + it->read0 += ws; + it->read_size -= ws; + it->write_size = 0; + it->write0 += ws; + return Copy·Step·read_surplus; + } + + /* + Read buffer is read from top down. Start with the largest address + just above the read buffer. Continue into lower addresses. + + write buffer is written from bottom up. Start with the lowest address, + continue into higher addresses. + */ + Local Copy·Status Copy·step_reverse_order(Copy·it *it){ + // How many bytes remain to be read/written + if( it->read_size == 0) return Copy·Step·complete; + size_t rs = it->read_size; + uint8_t *r1 = (uint8_t *)it->read0 + rs; + size_t ws = it->write_size; + uint8_t *w0 = (uint8_t *)it->write0; + + if(ws >= rs){ + uint8_t *r0 = (uint8_t *)it->read0; + Copy·bytes_reverse_order(r0, r1, w0); + it->read_size = 0; + it->write0 += rs; + it->write_size -= rs; + if(it->write_size == 0) return Copy·Step·perfect_fit; + return Copy·Step·write_available; + } + + // ws < rs + uint8_t *r0 = r1 - ws; + Copy·bytes_reverse_order(r0, r1, w0); + it->read0 -= ws; + it->read_size -= ws; + it->write_size = 0; + it->write0 += ws; + return Copy·Step·read_surplus; + } + + /* + Read bytes, write hex pairs. + Read and write are low address to high address. 
+ Each read byte value -> 2 write allocation bytes + */ + Local Copy·Status Copy·step_write_hex(Copy·it *it){ + + uint8_t *r = (uint8_t *)it->read0; + size_t rs = it->read_size; + + uint8_t *w = (uint8_t *)it->write0; + size_t ws = it->write_size & ~1; // even number write_size + size_t ews = it->write_size >> 1; // effective write size + + // If ews >= rs, read bytes all coped + if(ews >= rs){ + size_t ers = it->read_size << 1; // effective read size + it->write0 += ers; + it->write_size -= ers; + while(rs--){ + *(uint16_t *)w = Copy·byte_to_hex(*r++); + w += 2; + } + it->read0 = r; + it->read_size = 0; + + if(it->write_size == 0) return Copy·Step·perfect_fit; + if(it->write_size == 1) return Copy·Step·write_gap; + return Copy·Step·write_available; + } + + // ews < rs, write allocation all used, read bytes surplus + it->read0 += ews; + it->read_size -= ews; + while(ews--){ + *(uint16_t *)w = Copy·byte_to_hex(*r++); + w += 2; + } + it->write0 = w; + it->write_size -= ws; + + if(it->write_size == 1) return Copy·Step·read_surplus_write_gap; + return Copy·Step·read_surplus; + } + + /* + Read hex pairs, write bytes. + Read is low address to high address. + Write is low address to high address. + Each read hex pair -> 1 write byte. + */ + Local Copy·Status Copy·step_read_hex(Copy·it *it){ + uint8_t *r = (uint8_t *)it->read0; + size_t rs = it->read_size & ~1; // Must be even for hex pairs. + size_t ers = rs >> 1; // Effective read size: half the number of bytes. + + uint8_t *w = (uint8_t *)it->write0; + size_t ws = it->write_size; // Write size already in bytes. + + // If ws >= ers, all hex values are processed + if(ws >= ers){ + while(ers--){ + *w++ = Copy·hex_to_byte(*(uint16_t *)r); + r += 2; + } + + it->read0 = r; + it->read_size -= rs; + it->write0 = w; + it->write_size -= rs >> 1; // Each byte consumes two hex chars. + + if(it->write_size == 0) return Copy·Step·perfect_fit; + return Copy·Step·write_available; + } + + // ws < ers, read allocation surplus + while(ws--){ + *w++ = Copy·hex_to_byte(*(uint16_t *)r); + r += 2; + } + + it->read0 = r; + it->read_size -= ws << 1; // Each write byte consumes two hex chars. + it->write0 = w; + it->write_size = 0; + + return Copy·Step·read_surplus; + } + + + #endif // LOCAL + + +#endif // IMPLEMENTATION diff --git "a/developer/document\360\237\226\211/Copy.org" "b/developer/document\360\237\226\211/Copy.org" index 38b9729..6ab73ae 100644 --- "a/developer/document\360\237\226\211/Copy.org" +++ "b/developer/document\360\237\226\211/Copy.org" @@ -70,3 +70,24 @@ Copy·Status result = Copy·step(©_instance); This module provides an efficient and flexible memory copying framework suitable for low-level data manipulation tasks. +* Relevant todo note + +2025-02-24T08:09:57Z Copy.lib.c forced alginment machine might have issues with the block copy. + + The block copy aligns the read pointer by copying some initial + bytes. It ignores the alignment on the write pointer. Then at the end it does a byte + by byte copy of the ragged tail (less than a full word number of bytes). + + For a system that forces alignment, the initial alignment of the read pointer will get skipped. The write pointer will be aligned, so there is no problem in not checking it. + + However, the ragged tail loop can fire on a forced aligned + system. This will happen if the bounding read pointer passed in to + the block copy is not pointing to the first byte of a word. This can + happen if it is created adding `sizeof` of an object with that is not an even number + of bytes in a word long. 
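+  As a concrete illustration, the sketch below builds an inclusive upper bound from
+  `sizeof` of a 13 byte record (the `Record` type and `demo` function are hypothetical,
+  not part of the module); even when `read0` is word aligned, `read1` lands mid word,
+  so the ragged tail byte loop runs:
+
+  #+BEGIN_SRC c
+    #include <stdint.h>
+
+    // Hypothetical record type, for illustration only.
+    typedef struct{ char name[13]; } Record;   // sizeof(Record) == 13, not a whole number of words
+
+    // As defined in Copy.lib.c (project naming convention).
+    void *Copy·identity(void *read0 ,void *read1 ,void *write0);
+
+    void *demo(Record *src ,uint8_t *dst){
+      uint8_t *read0 = (uint8_t *)src;
+      uint8_t *read1 = read0 + sizeof(Record) - 1;   // inclusive upper bound, i.e. read0 + extentof
+      // Even when read0 is 8-byte aligned, read1 sits at offset 12, which is not the
+      // last byte of a word, so the ragged tail loop at the end of the block copy fires.
+      return Copy·identity(read0 ,read1 ,dst);
+    }
+  #+END_SRC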
+
+  The solution is probably to set a 'force align' macro based on the
+  architecture macro and gating the ragged tail code to do a word
+  copy, or adjusting the bulk section to do one more loop -- or having
+  a different block copy and a bytes_reversed block copy that loop on
+  words.
diff --git "a/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org" "b/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org"
new file mode 100644
index 0000000..2081b61
--- /dev/null
+++ "b/document\360\237\226\211/Inclusive_Exclusive_interval_bounds.org"
@@ -0,0 +1,88 @@
+#+TITLE: Exclusive vs. Inclusive Bounds
+#+AUTHOR: Thomas & Eidolon
+#+DATE: 2025-02-28
+#+OPTIONS: toc:nil
+
+* Introduction
+This note weighs the advantages and drawbacks of exclusive and inclusive bounds in programming, with a particular focus on C-style memory intervals. Exclusive upper bounds have been a longstanding convention in C and derived languages, but they have also led to significant issues, including microprocessor bugs. The alternative, inclusive bounds, offers advantages of its own, particularly in preventing out-of-bounds memory access.
+
+* Exclusive Bounds in C
+An exclusive bound means that the upper bound points one element past the actual interval. This approach aligns with C's idioms and iteration patterns:
+
+- A pointer iterated through memory naturally stops when it equals the upper bound.
+- The interval length is simply the upper bound minus the lower bound.
+
+However, this convention has caused notable problems:
+- The upper bound address may fall outside the allocated memory range.
+- In hardware, this can lead to prefetching errors, page faults, and potential security issues due to speculative execution.
+- In software, off-by-one errors frequently arise when handling array lengths and loops.
+
+A personal anecdote from AMD illustrates the severity of this issue: speculative execution could cause processors to prefetch addresses that lay outside valid memory pages, leading to processor bugs that were not initially acknowledged.
+
+* Inclusive Bounds: A Cleaner Approach?
+Inclusive bounds, in contrast, place the upper bound within the interval, reducing the risk of out-of-bounds memory accesses. Some advantages include:
+
+- The highest valid index remains within the range of representable values.
+- Iteration terminates on a `>` comparison rather than `==`, so an accidental overshoot still ends the loop.
+- The class of off-by-one errors associated with exclusive bounds is eliminated.
+
+This approach is used in some hardware and computational models, as detailed in TTCA. The book advocates extent-based indexing rather than length-based indexing to ensure safe iteration patterns.
+
+* The Boundary Issue in Inclusive Upper Bounds
+
+While inclusive bounds provide intuitive indexing and prevent off-by-one errors, they introduce a fundamental issue for **typed memory operations**, such as bulk copying or aligned processing.
+
+- A **pointer to the first byte** of an interval is **also a pointer to the first word**, making inclusive lower bounds type-agnostic.
+- However, a **pointer to the last byte** is **not** a pointer to the last word—it is merely the last address in the range.
+- In contrast, an **exclusive upper bound** remains type-agnostic, as it represents an address just past the valid range, which works independently of element size (see the sketch below).
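+
+The sketch below makes the adjustment concrete (the helper names are hypothetical; the inclusive variant mirrors `Copy·greatest_full_64` from the patch above). With an exclusive bound a single alignment mask locates the last full word; with an inclusive bound the pointer must first be stepped back by the element extent, which is exactly the type knowledge the bullets describe.
+
+#+BEGIN_SRC c
+  #include <stdint.h>
+
+  // Exclusive upper bound: mask only, no element-size step-back on the bound.
+  static uint64_t *last_full_word_exclusive(uint8_t *begin ,uint8_t *end){
+    uint64_t *w = (uint64_t *)((uintptr_t)end & ~(uintptr_t)0x7) - 1;
+    if((uint8_t *)w < begin) return NULL;   // no full aligned word fits in [begin ,end)
+    return w;
+  }
+
+  // Inclusive upper bound: step back by the extent (0x7) before masking,
+  // otherwise the selected word can run past p1.
+  static uint64_t *last_full_word_inclusive(uint8_t *p0 ,uint8_t *p1){
+    if(p1 - p0 < 0x7) return NULL;          // fewer than 8 bytes in [p0 ,p1]
+    uint64_t *w = (uint64_t *)(((uintptr_t)p1 - 0x7) & ~(uintptr_t)0x7);
+    if((uint8_t *)w < p0) return NULL;      // aligned word would start before the interval
+    return w;
+  }
+#+END_SRC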
+
+** Why This Matters for Bulk Operations
+Memory operations often process data in **word-sized chunks** for efficiency:
+- Copying memory in **aligned 64-bit words** requires knowing where the last valid word begins.
+- With **exclusive bounds**, this is straightforward: iteration stops at the upper bound.
+- With **inclusive bounds**, adjustments (`-8` for 64-bit words) become necessary to avoid overstepping.
+
+** Key Takeaways
+- **Inclusive lower bounds remain universally valid** and do not require type knowledge.
+- **Inclusive upper bounds require type knowledge**, which can be inefficient or unsafe.
+- **Exclusive upper bounds** naturally align with word-based processing, reducing extra adjustments.
+
+This suggests that **a hybrid approach**—inclusive lower bounds with exclusive upper bounds—may provide the best balance of **safety and efficiency**, particularly in systems with **low-level memory operations**.
+
+* Implementation Considerations
+Converting from exclusive to inclusive bounds is not trivial, as many established languages and libraries assume exclusive bounds. Some challenges include:
+
+- Existing APIs and standard libraries expect exclusive bounds, requiring additional adjustments.
+- Iteration logic must be adapted to use `<=` instead of `<` in many cases.
+- Some optimizations, such as pointer arithmetic against an exclusive bound, may require rethinking.
+
+* Conclusion
+The choice between exclusive and inclusive bounds is not merely stylistic; it has real implications for safety, correctness, and performance. While exclusive bounds remain dominant in C-derived languages, inclusive bounds eliminate a class of potential errors and are often preferable when designing new architectures or computational models.
+
+For TTCA, inclusive bounds were chosen specifically to prevent the issues that arise from exclusive bounds. Future discussions may explore the feasibility of transitioning software ecosystems toward inclusive bounds, or at least providing safer abstractions to mitigate the risks of exclusive bounds.
+
+* References
+- Thomas, *Tom's Turing Complete Computing Architecture (TTCA)*.
+
+* Summary of Observations
+Here is what stands out from this exercise:
+
+- Inclusive lower bounds work cleanly.
+  - The starting address is always valid, so there is no need for adjustments.
+  - Iteration is straightforward without additional logic.
+- Inclusive upper bounds require adjustments in word-based operations.
+  - A pointer to the last byte is not a pointer to the last word.
+  - This forces explicit alignment corrections (`& ~0x7` style masking).
+  - These corrections introduce additional computation (`-8`, `+1`, conditionals).
+- Exclusive upper bounds simplify bulk memory operations.
+  - If the loop simply runs while `ptr < end`, word-aligned processing works naturally.
+  - No extra adjustments are needed before entering the bulk copy loop.
+  - This matches how hardware and assembly-level operations work.
+- A hybrid approach may be the best path.
+  - Inclusive lower bounds keep indexing intuitive.
+  - Exclusive upper bounds avoid unnecessary adjustments in word-based processing.
+  - This mirrors how C and assembly tend to handle memory intervals (start inclusive, end exclusive).
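+
+As a closing sketch of the hybrid convention (inclusive lower bound, exclusive upper bound), the loop below copies a byte interval word by word and then finishes the ragged tail. It is an illustration only, not the module's `Copy·identity`: it uses `memcpy` for the word moves and ignores read alignment to keep the example short.
+
+#+BEGIN_SRC c
+  #include <stdint.h>
+  #include <string.h>
+
+  // begin : first valid byte (inclusive)
+  // end   : one past the last valid byte (exclusive)
+  // dst   : write pointer, returned advanced past the last byte written
+  static uint8_t *copy_hybrid(uint8_t *begin ,uint8_t *end ,uint8_t *dst){
+    uint8_t *r = begin;
+    while(end - r >= 8){          // bulk part: no -8 or +1 corrections on the bound
+      memcpy(dst ,r ,8);
+      r   += 8;
+      dst += 8;
+    }
+    while(r < end){               // ragged tail, fewer than 8 bytes
+      *dst++ = *r++;
+    }
+    return dst;
+  }
+#+END_SRC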