diff --git a/lonetix/smallbytecopy.h b/lonetix/smallbytecopy.h new file mode 100644 index 0000000..5a51ef4 --- /dev/null +++ b/lonetix/smallbytecopy.h @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later + +/** + * \file smallbytecopy.h + * + * Optimized routines for tiny buffer copy (<= 64 bytes). + * + * Whenever possible, avoid these routines and use regular + * memcpy()/memmove(). + * These routines are recommended only when: + * - you are absolutely sure of the maximum size of your data. + * - the compiler cannot possibly estimate it statically, + * otherwise the compiler could do a much better job at + * optimizing the copy. + * + * \copyright The DoubleFourteen Code Forge (C) All Rights Reserved + * \author Lorenzo Cogotti + */ + +#ifndef DF_SMALLBYTECOPY_H_ +#define DF_SMALLBYTECOPY_H_ + +#include "xpt.h" + +#if defined(__i386__) || defined(__x86_64__) +#include + +// Optimize copy and don't pay attention to alignment, +// ugly but fast and inline-able compared to plain memcpy()... + +#define _bytecopy1(d, s) ((void) (*(Uint8 *) (d) = *(Uint8 *) (s))) +#define _bytecopy2(d, s) ((void) (*(Uint16 *) (d) = *(Uint16 *) (s))) +#define _bytecopy4(d, s) ((void) (*(Uint32 *) (d) = *(Uint32 *) (s))) +#define _bytecopy8(d, s) ((void) (*(Uint64 *) (d) = *(Uint64 *) (s))) + +INLINE void _smallbytecopy4(void *__restrict dest, const void *__restrict src, size_t n) +{ + Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src; + + assert(n <= 4); + + switch (n) { + case 4: _bytecopy4(d, s); + break; + case 3: _bytecopy2(d + 1, s + 1); + /*FALLTHROUGH*/ + case 1: _bytecopy1(d, s); + break; + case 2: _bytecopy2(d, s); + break; + case 0: break; + default: UNREACHABLE; break; + } +} + +INLINE void _smallbytecopy8(void *__restrict dest, const void *__restrict src, size_t n) +{ + Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src; + + assert(n <= 8); + + switch (n) { + case 8: _bytecopy8(d, s); + break; + case 7: _bytecopy4(d + 3, s + 3); + /*FALLTHROUGH*/ + case 3: _bytecopy2(d + 1, s + 1); + /*FALLTHROUGH*/ + case 1: _bytecopy1(d, s); + break; + case 6: _bytecopy4(d + 2, s + 2); + /*FALLTHROUGH*/ + case 2: _bytecopy2(d, s); + break; + case 5: _bytecopy1(d + 4, s + 4); + /*FALLTHROUGH*/ + case 4: _bytecopy4(d, s); + break; + case 0: break; + default: UNREACHABLE; break; + } +} + +INLINE void _smallbytecopy16(void *__restrict dest, const void *__restrict src, size_t n) +{ + Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src; + + assert(n <= 16); + + if (n > 8) { + _bytecopy8(d, s); + d += 8, s += 8, n -= 8; + } + _smallbytecopy8(d, s, n); +} + +INLINE void _smallbytecopy32(void *__restrict dest, const void *__restrict src, size_t n) +{ + Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src; + + assert(n <= 32); + + if (n > 16) { + _bytecopy8(d, s); + _bytecopy8(d + 8, s + 8); + d += 16, s += 16, n -= 16; + } + _smallbytecopy16(d, s, n); +} + +INLINE void _smallbytecopy64(void *__restrict dest, const void *__restrict src, size_t n) +{ + Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src; + + assert(n <= 64); + + if (n > 32) { + _bytecopy8(d, s); + _bytecopy8(d + 8, s + 8); + _bytecopy8(d + 16, s + 16); + _bytecopy8(d + 24, s + 24); + d += 32, s += 32, n -= 32; + } + _smallbytecopy32(d, s, n); +} + +#undef _bytecopy1 +#undef _bytecopy2 +#undef _bytecopy4 +#undef _bytecopy8 + +#else +#include + +#define _smallbytecopy4(d, s, n) ((void) memcpy(d, s, n)) +#define _smallbytecopy8(d, s, n) ((void) memcpy(d, s, n)) +#define _smallbytecopy16(d, s, n) ((void) memcpy(d, s, n)) +#define _smallbytecopy32(d, s, n) ((void) memcpy(d, s, n)) +#define _smallbytecopy64(d, s, n) ((void) memcpy(d, s, n)) + +#endif + +#endif