-
from template_conversion import conversion
from make_N_constants import make_N_constants
// Return/Error Status and handlers
typedef enum{
  NS路Status路ok = 0    // zero so a status can be tested with `if( status )`
  ,NS路Status路overflow
  ,NS路Status路accumulator_overflow
  ,NS路Status路carry
  ,NS路Status路borrow
  ,NS路Status路undefined_divide_by_zero
  ,NS路Status路undefined_modulus_zero
  ,NS路Status路gt_max_shift_count
  ,NS路Status路spill_eq_operand // not currently signaled, result will be spill value
  ,NS路Status路one_word_product
  ,NS路Status路two_word_product
} NS路Status;
// Comparison-order result. Only the "greater than" case appears here;
// presumably lt/eq members are generated elsewhere — TODO confirm.
// (Fixed: a stray leading comma before the first enumerator was a syntax error.)
typedef enum{
  NS路Order_gt = 1
} NS路Order;
+ // when alloc runs out of memory
+ typedef NS路T *( *NS路Allocate_MemoryFault )(Address);
+
+ //----------------------------------------
+ // Interface
+
// Incomplete conversion NS路T -> PNT, NS路T leftovers
typedef struct {
size_t scale; // this is in bytes
PNT leftover; // Residual value in PNT format
} NS路Leftover_##PNT;
- #ifdef UINT8_MAX
- NS路LEFTOVER_PNT(uint8_t)
- #endif
- #ifdef UINT16_MAX
- NS路LEFTOVER_PNT(uint16_t)
- #endif
- #ifdef UINT32_MAX
- NS路LEFTOVER_PNT(uint32_t)
- #endif
- #ifdef UINT64_MAX
- NS路LEFTOVER_PNT(uint64_t)
- #endif
- #ifdef __UINT128_MAX
- NS路LEFTOVER_PNT(__uint128_t)
- #endif
-
- // when alloc runs out of memory
- typedef NS路T *( *NS路Allocate_MemoryFault )(Address);
-
- //----------------------------------------
- // Interface
-
- #define NS路TO_TYPE(PNT) NS路Status (*to_##PNT)(const NS路T *, PNT *, NS路Leftover_N *)
- #define NS路FROM_TYPE(PNT) NS路Status (*from_##PNT)(const PNT *, NS路T * ,NS路Leftover_##PNT *)
+ #define NS路WRITE(PNT) NS路Status (*write_##PNT)(const NS路T *, PNT *, NS路Leftover_N *)
+ #define NS路READ(PNT) NS路Status (*read_##PNT)(const PNT *, NS路T * ,NS路Leftover_##PNT *)
typedef struct{
// This part is included after the user's code. If the code at top is a 'header, then this is a 'tailer'.
#ifdef LOCAL
+ #include "Copy.lib.c"
+
CONSTANTS_BLOCK
NS路T *NS路zero = NS路constant + 0;
}
#ifdef UINT8_MAX
- CONV_8
#endif
#ifdef UINT16_MAX
- CONV_16
#endif
#ifdef UINT32_MAX
- CONV_32
#endif
#ifdef UINT64_MAX
- CONV_64
#endif
#ifdef __UINT128_MAX
- CONV_128
#endif
Local const NS路M NS路m = {
};
- #endif
+ #undef FACE
+ #include "Copy.lib.c"
-#endif
+ #endif // LOCAL
+
+#endif // IMPLEMENTATION
'''
--- /dev/null
+/*
+ Copy - Memory copy operations with attention to alignment.
+ Provides optimized copy and byte order reversal functions.
+*/
+
+#define Copy路DEBUG
+
+#ifndef FACE
+#define Copy路IMPLEMENTATION
+#define FACE
+#endif
+
+//--------------------------------------------------------------------------------
+// Interface
+
+#ifndef Copy路FACE
+#define Copy路FACE
+
+ #include <stdint.h>
+ #include <stddef.h>
+
+ void *Copy路region(void *read0 ,void *read1 ,void *write0);
+ void *Copy路reverse_byte_order(void *read0 ,void *read1 ,void *write0);
+
+#endif
+
+//--------------------------------------------------------------------------------
+// Implementation
+
+#ifdef Copy路IMPLEMENTATION
+
+ // this part goes into Nlib.a
+ #ifndef LOCAL
+ #endif
+
+ #ifdef LOCAL
+
+ /*
+ Copy路region - Copies a memory region while preserving byte order.
+ - Aligns reads for performance.
+ - Writes are assumed to be buffered and do not require alignment.
+ - Returns the updated write pointer.
+ */
/*
  Copy路region - copy the byte range [read0 ,read1) to write0, preserving
  byte order.
  - The read pointer is first brought to an 8-byte boundary, then the bulk
    of the region is moved a 64-bit word at a time.
  - Writes are assumed to be buffered and do not require alignment.
  - Returns the write pointer advanced past the last byte written.
*/
void *Copy路region(void *read0 ,void *read1 ,void *write0){

  uint8_t *src     = (uint8_t *)read0;
  uint8_t *src_end = (uint8_t *)read1;
  uint8_t *dst     = (uint8_t *)write0;

  //----------
  // Leading ragged part: advance src to an 8-byte boundary.
  if( (uintptr_t)src & 0x7 ){

    // Last byte of the word that src currently points into.
    uint8_t *word_last = (uint8_t *)((uintptr_t)src | 0x7);

    // The region ends at or before the boundary — byte copy and done.
    if( word_last >= src_end ){
      for( ; src < src_end ; *dst++ = *src++ );
      return dst;
    }

    // Byte copy through word_last inclusive; src lands on the boundary.
    for( ; src <= word_last ; *dst++ = *src++ );
  }
  // src is now aligned; *src itself has not been copied yet.

  //----------
  // Bulk: src is aligned; dst may not be, word writes tolerate that here.
  uint8_t *bulk_end = (uint8_t *)((uintptr_t)src_end & ~(uintptr_t)0x7);

  for( ; src < bulk_end ; src += 8 ,dst += 8 ){
    *(uint64_t *)dst = *(uint64_t *)src;
  }

  // If src_end was aligned, bulk_end == src_end and nothing remains.
  if( src == src_end ) return dst;

  //----------
  // Trailing ragged part, at most 7 bytes.
  do{
    *dst++ = *src++;
  }while( src < src_end );

  return dst;
}
+
+ /*
+ Copy路reverse_byte_order - Copies a memory region while reversing byte order.
+ - Reads in reverse order while writing in forward order.
+ - Uses `__builtin_bswap64` for efficient 64-bit swaps.
+ - Returns the updated write pointer.
+ */
/*
  Copy路reverse_byte_order - Copies the byte range [read0 ,read1) to write0
  while reversing byte order.
  - Reads in reverse order while writing in forward order.
  - Uses `__builtin_bswap64` for efficient 64-bit swaps once the read
    pointer is aligned.
  - Returns the updated write pointer.
*/
void *Copy路reverse_byte_order(void *read0 ,void *read1 ,void *write0){

  uint8_t *r  = (uint8_t *)read1; // one past the last byte to be copied
  uint8_t *r0 = (uint8_t *)read0;
  uint8_t *w  = (uint8_t *)write0;

  //----------
  // The potentially unaligned initial part (align read pointer).
  if( (uintptr_t)r & 0x7 ){

    // ANDing with `~0x7` moves it downward to the nearest lower alignment.
    uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7);

    // If the read interval is very small
    if(r10 < r0){
      while(r > r0){
        *w++ = *--r;
      }
      return w;
    }

    // Copy down to alignment boundary
    do{
      *w++ = *--r;
    }while(r > r10);
  }
  // r is now aligned; bytes at r and above have been copied

  //----------
  // The bulk copy part
  // r01 is the first aligned address greater than or equal to r0.
  uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7);

  while(r > r01){
    r -= 8;
    *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r);
    w += 8;
  }

  // Both r and r01 are aligned, so the loop above exits with r == r01.
  // When r0 was itself aligned, r01 == r0 and the region is fully copied.
  // (Was `if(r < r0)`, which can never be true here, so the aligned case
  // fell into the tail loop and read one byte below read0.)
  if(r == r0) return w;

  //----------
  // The ragged tail: bytes r-1 down to r0 inclusive, at most 7 bytes.
  // (Loop condition was `r >= r0`, which copied one byte below read0.)
  do{
    *w++ = *--r;
  }while(r > r0);

  return w;
}
+
+ #endif // LOCAL
+
+#endif // IMPLEMENTATION
--- /dev/null
+/*
+We pay attention to word alignment on the read side. Writes are probably
+buffered so alignment is not as big of a performance issue.
+*/
/*
  Copy路region - copies the byte range [read0 ,read1) to write0.

  We pay attention to word alignment on the read side. Writes are probably
  buffered so alignment is not as big of a performance issue.

  Returns the updated write pointer.
*/
void *Copy路region(void *read0 ,void *read1 ,void *write0){

  uint8_t *r  = (uint8_t *)read0;
  uint8_t *r1 = (uint8_t *)read1;
  uint8_t *w  = (uint8_t *)write0;

  //----------
  // The potentially unaligned initial part (align read pointer).
  if( (uintptr_t)r & 0x7 ){

    // At this point r is known to be unaligned.
    // ORing in `0x7` adds at most six.
    uint8_t *r01 = (uint8_t *)((uintptr_t)r | 0x7);

    // If the read interval is very small
    if(r01 >= r1){
      while(r < r1){
        *w++ = *r++;
      }
      return w;
    }

    // Copy up to alignment boundary
    do{
      *w++ = *r++;
    }while(r <= r01);
  }
  // r is now aligned, but *r has not yet been copied

  //----------
  // The bulk copy part (w is still possibly unaligned, but r is aligned)
  uint8_t *r10 = (uint8_t *)((uintptr_t)r1 & ~(uintptr_t)0x7);

  while(r < r10){
    *(uint64_t *)w = *(uint64_t *)r;
    w += 8;
    r += 8;
  }

  // If r1 was aligned then r10 == r1 and we are done.
  // (Was the no-effect statement `w;` — the missing `return` let the tail
  // loop copy one byte past read1 whenever read1 was aligned.)
  if(r == r1) return w;

  //----------
  // The ragged tail, up to 7 bytes
  do{
    *w++ = *r++;
  }while(r < r1);

  return w;
}
+
/*
  Copy路reverse_byte_order - copies the byte range [read0 ,read1) to write0
  while reversing byte order. Reads go leftward from read1; writes go
  forward from write0. Returns the updated write pointer.
*/
void *Copy路reverse_byte_order(void *read0 ,void *read1 ,void *write0){

  uint8_t *r  = (uint8_t *)read1; // Start from one past the last byte
  uint8_t *r0 = (uint8_t *)read0;
  uint8_t *w  = (uint8_t *)write0;

  // For r to be aligned means that it points to a 64 bit word, and the byte it
  // points to was the last copied coming out of the prior loop. This is due to
  // going leftward through the array, the part to the right having already
  // been done.

  //----------
  // The potentially unaligned initial part (align read pointer).
  // (Was `(uintptr_t)r & 0x7 != 0`; `!=` binds tighter than `&`, so it
  //  evaluated `r & 1` and treated even-but-unaligned pointers as aligned.)
  if( ((uintptr_t)r & 0x7) != 0 ){

    // At this point r is known to be unaligned.
    // ANDing with `~0x7` moves it **downward** to the nearest lower alignment.
    uint8_t *r10 = (uint8_t *)((uintptr_t)r & ~(uintptr_t)0x7);

    // If the read interval is very small
    if(r10 < r0){
      while(r > r0){
        *w++ = *--r;
      }
      return w;
    }

    // Copy down to alignment boundary
    do{
      *w++ = *--r;
    }while(r > r10);
  }
  // r is now aligned; bytes at r and above have been copied

  //----------
  // The bulk copy part

  // the first aligned address greater than or equal to r0
  uint8_t *r01 = (uint8_t *)( ((uintptr_t)r0 + (uintptr_t)0x7) & ~(uintptr_t)0x7);

  // as both r and r01 are aligned, r == r01 upon exit of this loop
  while(r > r01){
    r -= 8;
    *(uint64_t *)w = __builtin_bswap64(*(uint64_t *)r);
    w += 8;
  }

  // When r0 was aligned, r01 == r0 and we are done.
  // (Was `if(r < r0)`, which can never be true since r == r01 >= r0, so the
  //  aligned case fell into the tail and read one byte below read0.)
  if(r == r0) return w;

  //----------
  // The ragged tail: bytes r-1 down to r0 inclusive, up to 7 bytes.
  // (Loop condition was `r >= r0`, which copied one byte below read0.)
  do{
    *w++ = *--r;
  }while(r > r0);

  return w;
}
+
+
done
* 2025-02-14T16:20:52Z consider adding macros to replace multi-digit sequences with native types when available in some situations.
+
+
+* 2025-02-21T09:43:26Z Copy.lib.c should be templated with the block copy type, currently uint64_t templated, and the alignment mask templated. Also need macro gates to
+  turn off expansion when there is no uint64_t support. Also need a naive byte
+  copy version that works when there is not even a uint32_t type.
+
+