From: Thomas Walker Lynch
Date: Fri, 21 Feb 2025 15:53:42 +0000 (+0000)
Subject: adds Copy region and reverse_byte_order
X-Git-Url: https://git.reasoningtechnology.com/style/static/gitweb.css?a=commitdiff_plain;h=eb19360d5fa9882b87d6675f1feebf0ea019ce3c;p=N

adds Copy region and reverse_byte_order
---

diff --git "a/developer/Python\360\237\226\211/template_N.py" "b/developer/Python\360\237\226\211/template_N.py"
index 8362083..0c0ef0c 100644
--- "a/developer/Python\360\237\226\211/template_N.py"
+++ "b/developer/Python\360\237\226\211/template_N.py"
@@ -1,4 +1,3 @@
-
 from template_conversion import conversion
 from make_N_constants import make_N_constants
 
@@ -76,18 +75,17 @@ def template_N():
 
     // Return/Error Status and handlers
     typedef enum{
-      NS·Status·ok,
-      NS·Status·overflow,
-      NS·Status·accumulator_overflow,
-      NS·Status·carry,
-      NS·Status·borrow,
-      NS·Status·undefined_divide_by_zero,
-      NS·Status·undefined_modulus_zero,
-      NS·Status·gt_max_shift_count,
-      NS·Status·spill_eq_operand, // not currently signaled, result will be spill value
-      NS·Status·one_word_product,
-      NS·Status·two_word_product,
-      NS·Status·ConversionOverflow
+      NS·Status·ok = 0
+      ,NS·Status·overflow
+      ,NS·Status·accumulator_overflow
+      ,NS·Status·carry
+      ,NS·Status·borrow
+      ,NS·Status·undefined_divide_by_zero
+      ,NS·Status·undefined_modulus_zero
+      ,NS·Status·gt_max_shift_count
+      ,NS·Status·spill_eq_operand // not currently signaled, result will be spill value
+      ,NS·Status·one_word_product
+      ,NS·Status·two_word_product
     } NS·Status;
 
     typedef enum{
@@ -96,6 +94,12 @@ def template_N():
     ,NS·Order_gt = 1
     } NS·Order;
 
+    // when alloc runs out of memory
+    typedef NS·T *( *NS·Allocate_MemoryFault )(Address);
+
+    //----------------------------------------
+    // Interface
+
     // Incomplete conversion NS·T -> PNT, NS·T leftovers
     typedef struct {
       size_t scale; // this is in bytes
@@ -109,30 +113,8 @@ def template_N():
       PNT leftover; // Residual value in PNT format\
     } NS·Leftover_##PNT;
 
-    #ifdef UINT8_MAX
-      NS·LEFTOVER_PNT(uint8_t)
-    #endif
-    #ifdef UINT16_MAX
-      NS·LEFTOVER_PNT(uint16_t)
-    #endif
-    #ifdef UINT32_MAX
-      NS·LEFTOVER_PNT(uint32_t)
-    #endif
-    #ifdef UINT64_MAX
-      NS·LEFTOVER_PNT(uint64_t)
-    #endif
-    #ifdef __UINT128_MAX
-      NS·LEFTOVER_PNT(__uint128_t)
-    #endif
-
-    // when alloc runs out of memory
-    typedef NS·T *( *NS·Allocate_MemoryFault )(Address);
-
-    //----------------------------------------
-    // Interface
-
-    #define NS·TO_TYPE(PNT) NS·Status (*to_##PNT)(const NS·T *, PNT *, NS·Leftover_N *)
-    #define NS·FROM_TYPE(PNT) NS·Status (*from_##PNT)(const PNT *, NS·T * ,NS·Leftover_##PNT *)
+    #define NS·WRITE(PNT) NS·Status (*write_##PNT)(const NS·T *, PNT *, NS·Leftover_N *)
+    #define NS·READ(PNT) NS·Status (*read_##PNT)(const PNT *, NS·T * ,NS·Leftover_##PNT *)
 
 
    typedef struct{
@@ -268,6 +250,8 @@ def template_N():
 
     // This part is included after the user's code. If the code at top is a 'header, then this is a 'tailer'.
     #ifdef LOCAL
+      #include "Copy.lib.c"
+
       CONSTANTS_BLOCK
 
       NS·T *NS·zero = NS·constant + 0;
@@ -514,19 +498,14 @@ def template_N():
       }
 
       #ifdef UINT8_MAX
-        CONV_8
       #endif
       #ifdef UINT16_MAX
-        CONV_16
       #endif
       #ifdef UINT32_MAX
-        CONV_32
       #endif
       #ifdef UINT64_MAX
-        CONV_64
       #endif
       #ifdef __UINT128_MAX
-        CONV_128
       #endif
 
       Local const NS·M NS·m = {
@@ -582,9 +561,12 @@ def template_N():
 
       };
 
-    #endif
+      #undef FACE
+      #include "Copy.lib.c"
 
-#endif
+    #endif // LOCAL
+
+#endif // IMPLEMENTATION
 
 '''
diff --git "a/developer/cc\360\237\226\211/Copy.lib.c" "b/developer/cc\360\237\226\211/Copy.lib.c"
new file mode 100644
index 0000000..2563ec0
--- /dev/null
+++ "b/developer/cc\360\237\226\211/Copy.lib.c"
@@ -0,0 +1,152 @@
+/*
+  Copy - Memory copy operations with attention to alignment.
+  Provides optimized copy and byte order reversal functions.
+*/
+
+#define Copy·DEBUG
+
+#ifndef FACE
+#define Copy·IMPLEMENTATION
+#define FACE
+#endif
+
+//--------------------------------------------------------------------------------
+// Interface
+
+#ifndef Copy·FACE
+#define Copy·FACE
+
+  #include <stdint.h>
+  #include <stddef.h>
+
+  void *Copy·region(void *read0 ,void *read1 ,void *write0);
+  void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0);
+
+#endif
+
+//--------------------------------------------------------------------------------
+// Implementation
+
+#ifdef Copy·IMPLEMENTATION
+
+  // this part goes into Nlib.a
+  #ifndef LOCAL
+  #endif
+
+  #ifdef LOCAL
+
+    /*
+      Copy·region - Copies a memory region while preserving byte order.
+      - Aligns reads for performance.
+      - Writes are assumed to be buffered and do not require alignment.
+      - Returns the updated write pointer.
+    */
+    void *Copy·region(void *read0 ,void *read1 ,void *write0){
+
+      uint8_t *r = (uint8_t *)read0;
+      uint8_t *r1 = (uint8_t *)read1;
+      uint8_t *w = (uint8_t *)write0;
+
+      //----------
+      // The potentially unaligned initial part (align read pointer).
+      if( (uintptr_t)r & 0x7 ){
+
+        // ORing in `0x7` adds at most six bytes to r.
+        uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7);
+
+        // If the read interval is very small
+        if(r01 >= r1){
+          while(r < r1){
+            *w++ = *r++;
+          }
+          return w;
+        }
+
+        // Copy up to alignment boundary
+        do{
+          *w++ = *r++;
+        }while(r <= r01);
+      }
+      // r is now aligned, but *r has not yet been copied
+
+      //----------
+      // The bulk copy part (w is still possibly unaligned, but r is aligned)
+      uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7);
+
+      while(r < r10){
+        *(uint64_t *)w = *(uint64_t *)r;
+        w += 8;
+        r += 8;
+      }
+
+      // If r1 was aligned then r10 == r1 and we are done
+      if(r == r1) return w;
+
+      //----------
+      // The ragged tail, up to 7 bytes
+      do{
+        *w++ = *r++;
+      }while(r < r1);
+
+      return w;
+    }
+
+    /*
+      Copy·reverse_byte_order - Copies a memory region while reversing byte order.
+      - Reads in reverse order while writing in forward order.
+      - Uses `__builtin_bswap64` for efficient 64-bit swaps.
+      - Returns the updated write pointer.
+    */
+    void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0){
+
+      uint8_t *r = (uint8_t *)read1; // Start from the last byte
+      uint8_t *r0 = (uint8_t *)read0;
+      uint8_t *w = (uint8_t *)write0;
+
+      //----------
+      // The potentially unaligned initial part (align read pointer).
+      if( (uintptr_t)r & 0x7 ){
+
+        // ANDing with `~0x7` moves it downward to the nearest lower alignment.
+        uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7);
+
+        // If the read interval is very small
+        if(r10 < r0){
+          while(r > r0){
+            *w++ = *--r;
+          }
+          return w;
+        }
+
+        // Copy down to alignment boundary
+        do{
+          *w++ = *--r;
+        }while(r > r10);
+      }
+      // r is now aligned, and *r has been copied
+
+      //----------
+      // The bulk copy part
+      uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7);
+
+      while(r > r01){
+        r -= 8;
+        *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r);
+        w += 8;
+      }
+
+      // If r0 was aligned then r01 == r0 and we are done
+      if(r == r0) return w;
+
+      //----------
+      // The ragged tail, up to 7 bytes; stop once the byte at r0 has been copied
+      do{
+        *w++ = *--r;
+      }while(r > r0);
+
+      return w;
+    }
+
+  #endif // LOCAL
+
+#endif // IMPLEMENTATION
diff --git "a/developer/deprecated\360\237\226\211/copy.lib.c" "b/developer/deprecated\360\237\226\211/copy.lib.c"
new file mode 100644
index 0000000..58ed27e
--- /dev/null
+++ "b/developer/deprecated\360\237\226\211/copy.lib.c"
@@ -0,0 +1,117 @@
+/*
+  We pay attention to word alignment on the read side. Writes are probably
+  buffered so alignment is not as big of a performance issue.
+*/
+void *Copy·region(void *read0 ,void *read1 ,void *write0){
+
+  uint8_t *r = (uint8_t *)read0;
+  uint8_t *r1 = (uint8_t *)read1;
+  uint8_t *w = (uint8_t *)write0;
+
+  //----------
+  // The potentially unaligned initial part (align read pointer).
+  if( (uintptr_t)r & 0x7 ){
+
+    // At this point r is known to be unaligned.
+    // ORing in `0x7` adds at most six
+    uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7);
+
+    // then the read interval is very small
+    if(r01 >= r1){
+      while(r < r1){
+        *w++ = *r++;
+      }
+      return w;
+    }
+
+    // Copy up to alignment boundary
+    do{
+      *w++ = *r++;
+    }while(r <= r01);
+  }
+  // r is now aligned, but *r has not yet been copied
+
+
+  //----------
+  // The bulk copy part (w is still possibly unaligned, but r is aligned)
+  uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7);
+
+  while(r < r10){
+    *(uint64_t *)w = *(uint64_t *)r;
+    w += 8;
+    r += 8;
+  }
+
+  // if r1 was aligned then r10 == r1 and we are done
+  if(r == r1) w;
+
+  //----------
+  // The ragged tail, up to 7 bytes
+  do{
+    *w++ = *r++;
+  }while(r < r1);
+
+  return w;
+
+}
+
+void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0){
+
+  uint8_t *r = (uint8_t *)read1; // Start from the last byte
+  uint8_t *r0 = (uint8_t *)read0;
+  uint8_t *w = (uint8_t *)write0;
+
+  // For r to be aligned, means that it points to a 64 bit word, and the byte it
+  // points was the last copied coming out of the prior loop. This is due to going in
+  // leftward through the array, and the part to the rightward already having
+  // been done.
+
+  //----------
+  // The potentially unaligned initial part (align read pointer).
+  if( (uintptr_t)r & 0x7 != 0){
+
+    // At this point r is known to be unaligned.
+    // ANDing with `~0x7` moves it **downward** to the nearest lower alignment.
+    uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7);
+
+    // If the read interval is very small
+    if(r10 < r0){
+      while(r > r0){
+        *w++ = *--r;
+      }
+      return w;
+    }
+
+    // Copy down to alignment boundary
+    do{
+      *w++ = *--r;
+    }while(r > r10);
+  }
+  // r is now aligned, and *r has been copied
+
+  //----------
+  // The bulk copy part
+
+  // the first aligned address greater than or equal to r0
+  uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7);
+
+  // as both r and r01 are aligned, r == r01 upon exit of this loop
+  while(r > r01){
+    r -= 8;
+    *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r);
+    w += 8;
+  }
+
+  // If r0 was aligned then r01 == r0 and we are done
+  if(r < r0) return w;
+
+  //----------
+  // The ragged tail, up to 7 bytes
+  do{
+    *w++ = *--r;
+  }while(r >= r0);
+
+  return w;
+}
+
+
diff --git "a/developer/document\360\237\226\211/todo.org" "b/developer/document\360\237\226\211/todo.org"
index c69a1fd..a63c4b6 100644
--- "a/developer/document\360\237\226\211/todo.org"
+++ "b/developer/document\360\237\226\211/todo.org"
@@ -3,3 +3,10 @@ done
 * 2025-02-14T16:20:52Z consider adding macros to replace multi-digit sequences with native types when available
   in some situations.
 
+
+
+* 2025-02-21T09:43:26Z Copy.lib.c should be templated on the block copy type, currently hard-coded to uint64_t, and on the alignment mask. Also need macro gates to
+  turn off expansion when there is no uint64_t support. Also need a naive byte
+  copy version that works when there is not even a uint32_t type.
+
+
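
Usage sketch for the new Copy interface. Copy·region and Copy·reverse_byte_order take a half-open byte interval [read0 ,read1) and return the advanced write pointer, per the comments in Copy.lib.c above. The standalone program below is not part of the commit: the buffer names and the 13-byte size are made up for illustration, and it assumes a toolchain that accepts the `·` (U+00B7) character in identifiers, as this project's sources do.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Declarations matching the Copy.lib.c interface block above.
void *Copy·region(void *read0 ,void *read1 ,void *write0);
void *Copy·reverse_byte_order(void *read0 ,void *read1 ,void *write0);

int main(void){
  // 13 bytes: deliberately not a multiple of 8, so the unaligned head and the
  // ragged tail paths are both plausible.
  uint8_t src[13] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12};
  uint8_t fwd[13];
  uint8_t rev[13];

  // Order-preserving copy of [src ,src+13); the returned pointer should sit
  // exactly 13 bytes past the start of the destination.
  uint8_t *end = Copy·region(src ,src + 13 ,fwd);
  printf("forward: wrote %td bytes, intact=%d\n"
        ,end - fwd ,memcmp(src ,fwd ,13) == 0);

  // Byte-order reversing copy: rev[0] should equal src[12], rev[12] src[0].
  end = Copy·reverse_byte_order(src ,src + 13 ,rev);
  printf("reversed: wrote %td bytes, rev[0]=%u rev[12]=%u\n"
        ,end - rev ,rev[0] ,rev[12]);

  return 0;
}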
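
Both functions lean on the same three pieces of pointer arithmetic: ORing an unaligned address with 0x7 lands on the last byte of its 8-byte word (adding at most six), ANDing with ~0x7 rounds down to the word base, and adding 7 before masking rounds up to the next aligned address. A small self-check of those idioms, independent of the commit; the address value 0x1003 is arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void){
  uintptr_t p = 0x1003; // arbitrary unaligned address value

  uintptr_t last_byte  = p | 0x7;                    // 0x1007, last byte of p's 8-byte word
  uintptr_t word_base  = p & ~(uintptr_t)0x7;        // 0x1000, round down
  uintptr_t next_align = (p + 7) & ~(uintptr_t)0x7;  // 0x1008, round up

  assert(last_byte - p <= 6);                        // "adds at most six" holds when p is unaligned
  assert((word_base  & 0x7) == 0 && word_base  <= p);
  assert((next_align & 0x7) == 0 && next_align >= p);

  printf("p=%#lx  p|7=%#lx  p&~7=%#lx  (p+7)&~7=%#lx\n"
        ,(unsigned long)p ,(unsigned long)last_byte
        ,(unsigned long)word_base ,(unsigned long)next_align);
  return 0;
}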
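
The new todo entry asks for Copy.lib.c to be templated on the block transfer type and the alignment mask, with a naive byte copy fallback when wide unsigned types are absent. One possible shape for that gate, sketched here and not part of the commit; the names Copy·BLOCK_T, Copy·ALIGN_MASK, and Copy·NAIVE_BYTE_COPY are invented for illustration, and the same `·` identifier caveat applies.

#include <stdint.h>

// Pick the widest available block type, falling back to plain bytes.
#if defined(UINT64_MAX)
  typedef uint64_t Copy·BLOCK_T;
  #define Copy·ALIGN_MASK ((uintptr_t)0x7)
#elif defined(UINT32_MAX)
  typedef uint32_t Copy·BLOCK_T;
  #define Copy·ALIGN_MASK ((uintptr_t)0x3)
#else
  #define Copy·NAIVE_BYTE_COPY
#endif

#ifdef Copy·NAIVE_BYTE_COPY
  // Naive fallback: correct on any platform, no alignment games, no wide loads.
  void *Copy·region(void *read0 ,void *read1 ,void *write0){
    unsigned char *r  = (unsigned char *)read0;
    unsigned char *r1 = (unsigned char *)read1;
    unsigned char *w  = (unsigned char *)write0;
    while(r < r1) *w++ = *r++;
    return w;
  }
#endif

A matching byte-at-a-time Copy·reverse_byte_order, and a bulk loop written against Copy·BLOCK_T instead of uint64_t, would complete the fallback described in the todo.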