From 66303e9ba25d5f9dbf8806ddfe0f148680bc0e35 Mon Sep 17 00:00:00 2001 From: Olaf Barthel Date: Mon, 13 Nov 2006 09:49:49 +0000 Subject: [PATCH] - Added a directory to hold contributed code which has not been integrated with the library yet. - The byteswap code was contributed by Peter Bengtsson. Thank you very much! git-svn-id: file:///Users/olsen/Code/migration-svn-zu-git/logical-line-staging/clib2/trunk@15163 87f5fb63-7c3d-0410-a384-fd976d0f7a62 --- library/contrib/README | 3 + library/contrib/byteswap/byteswap.h | 183 ++++++++++++++++++++ library/contrib/byteswap/byteswap_bswap16.c | 28 +++ library/contrib/byteswap/byteswap_bswap24.c | 29 ++++ library/contrib/byteswap/byteswap_bswap32.c | 31 ++++ library/contrib/byteswap/byteswap_bswap64.c | 48 +++++ library/contrib/byteswap/byteswap_swab.c | 70 ++++++++ library/contrib/byteswap/byteswap_swab24.c | 91 ++++++++++ library/contrib/byteswap/byteswap_swab32.c | 112 ++++++++++++ library/contrib/byteswap/byteswap_swab64.c | 101 +++++++++++ 10 files changed, 696 insertions(+) create mode 100644 library/contrib/README create mode 100644 library/contrib/byteswap/byteswap.h create mode 100644 library/contrib/byteswap/byteswap_bswap16.c create mode 100644 library/contrib/byteswap/byteswap_bswap24.c create mode 100644 library/contrib/byteswap/byteswap_bswap32.c create mode 100644 library/contrib/byteswap/byteswap_bswap64.c create mode 100644 library/contrib/byteswap/byteswap_swab.c create mode 100644 library/contrib/byteswap/byteswap_swab24.c create mode 100644 library/contrib/byteswap/byteswap_swab32.c create mode 100644 library/contrib/byteswap/byteswap_swab64.c diff --git a/library/contrib/README b/library/contrib/README new file mode 100644 index 0000000..811c04f --- /dev/null +++ b/library/contrib/README @@ -0,0 +1,3 @@ +This directory contains contributions which have not yet been integrated +with the clib2 library build but which should be in the CVS repository +both for safekeeping and for you to look at and adapt. 
diff --git a/library/contrib/byteswap/byteswap.h b/library/contrib/byteswap/byteswap.h
new file mode 100644
index 0000000..4664bf5
--- /dev/null
+++ b/library/contrib/byteswap/byteswap.h
@@ -0,0 +1,183 @@
+
+#ifndef __BYTESWAP_H
+#define __BYTESWAP_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#if defined(__GNUC__)
+#define __CONST_FUNC __attribute__((const))
+#else
+#define __CONST_FUNC /* Nothing */
+#endif
+
+/* Single value byteswap functions. */
+
+extern __CONST_FUNC uint16_t bswap16(uint16_t);
+extern __CONST_FUNC uint32_t bswap24(uint32_t);
+extern __CONST_FUNC uint32_t bswap32(uint32_t);
+
+#ifdef INT64_MIN
+extern __CONST_FUNC uint64_t bswap64(uint64_t);
+#endif
+
+/* Block byteswap functions. The swab() function usually resides in unistd.h, so perhaps it should be moved there? */
+/* NOTE: Contrary to the standard swab(), this version returns the "to" pointer and the pointers are not restrict
+ * qualified - so swapping buffer-contents in-place is supported.
+ * Also, swab24(), swab32() and swab64() are non-standard functions.
+ */
+
+extern void *swab(void *from,void *to,ssize_t nbytes);
+extern void *swab24(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 24-bit words instead. */
+extern void *swab32(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 32-bit words instead. */
+extern void *swab64(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 64-bit words instead. */
+
+#define swab16 swab /* NOTE(review): was "#define swab16(x) swab(x)", which cannot work - swab() takes three arguments. */
+
+/*
+ * Optimized inline-versions for the single-value functions follow.
+ * Only GCC+PPC and GCC+m68k support for now.
+ */
+
+#if defined(__GNUC__)
+
+/* Select implementation.
*/ + +#define bswap16(x) (__builtin_constant_p(x))?__const_swap16(x):__swap16(x) +#define bswap24(x) (__builtin_constant_p(x))?__const_swap24(x):__swap24(x) +#define bswap32(x) (__builtin_constant_p(x))?__const_swap32(x):__swap32(x) +#define bswap64(x) (__builtin_constant_p(x))?__const_swap64(x):__swap64(x) + +/* Assembler implementations */ + +#if defined(__PPC__) + +static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) { + uint_fast16_t result; + __asm__("\ + rlwinm %[result],%[u16],8,16,24\n\ + rlwimi %[result],%[u16],24,24,31\n\ + ":[result]"=&r"(result):[u16]"r"(u16)); + return(result); +} + +static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) { + uint_fast32_t result; + __asm__("\ + rlwinm %[result],%[u32],16,8,31\n\ + rlwimi %[result],%[u32],0,16,24\n\ + ":[result]"=&r"(result):[u32]"r"(u32)); + return(result); +} + +static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) { + uint_fast32_t result; + __asm__("\ + rlwinm %[result],%[u32],8,8,31\n\ + rlwimi %[result],%[u32],24,0,7\n\ + rlwimi %[result],%[u32],24,16,23\n\ + ":[result]"=&r"(result):[u32]"r"(u32)); + return(result); +} + +/* + * Note: __swap64() might perhaps be optimized a bit more by scheduling the + * instructions to alternate register-use, but this instead means there + * are two less registers free since "u64" and "result" may no longer overlap. + * Decisions, decisions.... 
+ */
+
+static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
+   uint_fast64_t result;
+   uint_fast32_t tmp;
+   __asm__("\
+      rlwinm %[tmp],%[u64],8,8,31\n\
+      rlwimi %[tmp],%[u64],24,0,7\n\
+      rlwimi %[tmp],%[u64],24,16,23\n\
+      rlwinm %[result],%L[u64],8,8,31\n\
+      rlwimi %[result],%L[u64],24,0,7\n\
+      rlwimi %[result],%L[u64],24,16,23\n\
+      or %L[result],%[tmp],%[tmp]\n\
+   ":[result]"=r"(result),[tmp]"=&r"(tmp):[u64]"r"(u64));
+   return(result);
+}
+
+#elif defined(__mc68020__)
+
+static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) {
+   __asm__("\
+      rol.w #8,%[u16]\n\
+   ":[u16]"+d"(u16)::"cc");
+   return(u16);
+}
+
+static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) {
+   __asm__("\
+      rol.w #8,%[u32]\n\
+      swap %[u32]\n\
+      rol.w #8,%[u32]\n\
+      ror.l #8,%[u32]\n\
+   ":[u32]"+d"(u32)::"cc");
+   return(u32);
+}
+
+static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) {
+   __asm__("\
+      rol.w #8,%[u32]\n\
+      swap %[u32]\n\
+      rol.w #8,%[u32]\n\
+   ":[u32]"+d"(u32)::"cc");
+   return(u32);
+}
+
+static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
+   __asm__("\
+      rol.w #8,%[u64]\n\
+      rol.w #8,%L[u64]\n\
+      swap %[u64]\n\
+      swap %L[u64]\n\
+      rol.w #8,%[u64]\n\
+      rol.w #8,%L[u64]\n\
+      exg %[u64],%L[u64]\n\
+   ":[u64]"+d"(u64)::"cc");
+   return(u64);
+}
+
+#else
+/* Unknown or undefined architecture. Perhaps compiling with "-strict -ansi", but should not use this header then anyway.
*/ +#undef bswap16 +#undef bswap24 +#undef bswap32 +#undef bswap64 +#define bswap16(x) (__builtin_constant_p(x))?__const_swap16(x):bswap16(x) +#define bswap24(x) (__builtin_constant_p(x))?__const_swap24(x):bswap24(x) +#define bswap32(x) (__builtin_constant_p(x))?__const_swap32(x):bswap32(x) +#define bswap64(x) (__builtin_constant_p(x))?__const_swap64(x):bswap64(x) +#endif + +/* C implementations for constant values */ + +static __inline__ uint16_t __const_swap16(uint16_t u16) { + return(u16>>8|u16<<8); +} + +static __inline__ uint32_t __const_swap24(uint32_t u32) { + return(((u32&0xff)<<16)|((u32&0xff00))|((u32&0xff0000)>>16)); +} + +static __inline__ uint32_t __const_swap32(uint32_t u32) { + return(((u32&0xff)<<24)|((u32&0xff00)<<8)|((u32&0xff0000)>>8)|((u32&0xff000000)>>24)); +} + +static __inline__ uint64_t __const_swap64(uint64_t u64) { + return(((u64&0xffLL)<<56)|((u64&0xff00LL)<<40)|((u64&0xff0000LL)<<24)|((u64&0xff000000LL)<<8)| + ((u64&0xff00000000LL)>>8)|((u64&0xff0000000000LL)>>24)|((u64&0xff000000000000LL)>>40)|((u64&0xff00000000000000LL)>>56)); +} + +#endif /* __GNUC__ */ + + +#endif /* __BYTESWAP_H */ + +/* vi:set ts=3: */ + diff --git a/library/contrib/byteswap/byteswap_bswap16.c b/library/contrib/byteswap/byteswap_bswap16.c new file mode 100644 index 0000000..04ac869 --- /dev/null +++ b/library/contrib/byteswap/byteswap_bswap16.c @@ -0,0 +1,28 @@ + +#if defined(__PPC__) && defined(__GNUC__) + +asm("\ + .text\n\ + .align 2\n\ + .globl bswap16\n\ + .type bswap16, @function\n\ +bswap16:\n\ +# rlwinm %r4,%r3,8,16,24\n\ +# rlwimi %r4,%r3,24,24,31\n\ +# or %r3,%r4,%r4\n\ + rlwimi %r3,%r3,16,8,15\n\ + srwi %r3,%r3,8\n\ + blr\n\ +"); + +#else + +#include + +uint16_t bswap16(uint16_t u16) +{ +return(u16>>8|u16<<8); +} + +#endif + diff --git a/library/contrib/byteswap/byteswap_bswap24.c b/library/contrib/byteswap/byteswap_bswap24.c new file mode 100644 index 0000000..7474c3d --- /dev/null +++ b/library/contrib/byteswap/byteswap_bswap24.c @@ -0,0 +1,29 @@ + 
+#if defined(__PPC__) && defined(__GNUC__)
+
+asm("   .text\n\
+   .align 2\n\
+   .globl bswap24\n\
+   .type bswap24, @function\n\
+bswap24:\n\
+   rlwinm %r4,%r3,16,8,31\n\
+   rlwimi %r4,%r3,0,16,24\n\
+   or %r3,%r4,%r4\n\
+   blr\n\
+");
+
+#else
+
+#include <stdint.h>
+
+uint32_t bswap24(uint32_t u32)
+{
+   return(
+      ((u32&0xff)<<16)|
+      ((u32&0xff00))|
+      ((u32&0xff0000)>>16)
+      );
+}
+
+#endif
+
diff --git a/library/contrib/byteswap/byteswap_bswap32.c b/library/contrib/byteswap/byteswap_bswap32.c
new file mode 100644
index 0000000..f12b6f9
--- /dev/null
+++ b/library/contrib/byteswap/byteswap_bswap32.c
@@ -0,0 +1,31 @@
+
+#if defined(__PPC__) && defined(__GNUC__)
+
+asm("   .text\n\
+   .align 2\n\
+   .globl bswap32\n\
+   .type bswap32, @function\n\
+bswap32:\n\
+   rlwinm %r4,%r3,8,8,31\n\
+   rlwimi %r4,%r3,24,0,7\n\
+   rlwimi %r4,%r3,24,16,23\n\
+   or %r3,%r4,%r4\n\
+   blr\n\
+");
+
+#else
+
+#include <stdint.h>
+
+uint32_t bswap32(uint32_t u32)
+{
+   return(
+      ((u32&0xff)<<24)|
+      ((u32&0xff00)<<8)|
+      ((u32&0xff0000)>>8)|
+      ((u32&0xff000000)>>24)
+      );
+}
+
+#endif
+
diff --git a/library/contrib/byteswap/byteswap_bswap64.c b/library/contrib/byteswap/byteswap_bswap64.c
new file mode 100644
index 0000000..233393d
--- /dev/null
+++ b/library/contrib/byteswap/byteswap_bswap64.c
@@ -0,0 +1,48 @@
+
+#if defined(USE_64_BIT_INTS)
+
+#if defined(__PPC__) && defined(__GNUC__)
+
+asm("   .text\n\
+   .align 2\n\
+   .globl bswap64\n\
+   .type bswap64, @function\n\
+bswap64:\n\
+   rlwinm %r5,%r3,8,8,31\n\
+   rlwimi %r5,%r3,24,0,7\n\
+   rlwimi %r5,%r3,24,16,23\n\
+   rlwinm %r3,%r4,8,8,31\n\
+   rlwimi %r3,%r4,24,0,7\n\
+   rlwimi %r3,%r4,24,16,23\n\
+   or %r4,%r5,%r5\n\
+   blr\n\
+");
+
+#else
+
+#include <stdint.h>
+
+uint64_t bswap64(uint64_t u64)
+{
+union {
+   uint64_t ll;
+   uint32_t l[2];
+} v={.ll=u64};
+uint32_t tmp;
+tmp=v.l[0];
+v.l[0]=((v.l[1]&0xff)<<24)|
+   ((v.l[1]&0xff00)<<8)|
+   ((v.l[1]&0xff0000)>>8)|
+   ((v.l[1]&0xff000000)>>24);
+v.l[1]=((tmp&0xff)<<24)|
+   ((tmp&0xff00)<<8)|
+   ((tmp&0xff0000)>>8)|
+   ((tmp&0xff000000)>>24);
+return(v.ll);
+}
+
+#endif
+
+#endif
+
+
diff --git a/library/contrib/byteswap/byteswap_swab.c b/library/contrib/byteswap/byteswap_swab.c
new file mode 100644
index 0000000..21d0691
--- /dev/null
+++ b/library/contrib/byteswap/byteswap_swab.c
@@ -0,0 +1,70 @@
+
+#if defined(__GNUC__) && defined(__PPC__)
+
+/* r3=from, r4=to, r5=len/count, r6=index, r7=load/store/temp */
+
+asm("\
+   .text\n\
+   .align 2\n\
+   .globl swab\n\
+   .type swab,@function\n\
+swab:\n\
+   dcbt 0,%r3\n\
+   srawi. %r5,%r5,1\n\
+   bc 4,gt,.exit\n\
+   andi. %r7,%r3,3 # Check if we start on an address evenly divisible by 4.\n\
+   li %r6,0\n\
+   bc 4,gt,.preploop\n\
+   lhbrx %r7,%r6,%r3 # Fix alignment if needed.\n\
+   sthx %r7,%r6,%r4\n\
+   addi %r6,%r6,2\n\
+   subi %r5,%r5,1\n\
+.preploop:\n\
+   andi. %r7,%r5,1 # Check if even or odd number of 16-bit words.\n\
+   srawi %r5,%r5,1 # Number of 32-bit words to half-swap.\n\
+   mtctr %r5\n\
+   bc 12,gt,.oddloop # Jump to loop for odd number of 16-bit words.\n\
+.loop: # Loop is 'unrolled' by reading/writing 32-bit words.\n\
+   lwbrx %r7,%r6,%r3\n\
+   rotlwi %r7,%r7,16\n\
+   stwx %r7,%r6,%r4\n\
+   addi %r6,%r6,4\n\
+   bc 0,lt,.loop\n\
+.exit:\n\
+   or %r3,%r4,%r4\n\
+   blr\n\
+.oddloop:\n\
+   lwbrx %r7,%r6,%r3\n\
+   rotlwi %r7,%r7,16\n\
+   stwx %r7,%r6,%r4\n\
+   addi %r6,%r6,4\n\
+   bc 0,lt,.oddloop\n\
+   subi %r6,%r6,2\n\
+   lhbrx %r7,%r6,%r3 # Fix last 16-bit word.\n\
+   sthx %r7,%r6,%r4\n\
+   or %r3,%r4,%r4\n\
+   blr\n\
+");
+
+#else
+
+#include <stdint.h>
+#include <sys/types.h>
+
+void *swab(void *from,void *to,ssize_t len)
+{
+int i;
+uint16_t u16,*u16in=from,*u16out=to;
+
+for(i=0;i<(len>>1);i++) {
+   u16=u16in[i];
+   u16out[i]=u16>>8|u16<<8;
+}
+
+return(u16out);
+}
+
+#endif
+
+
+
diff --git a/library/contrib/byteswap/byteswap_swab24.c b/library/contrib/byteswap/byteswap_swab24.c
new file mode 100644
index 0000000..16eda26
--- /dev/null
+++ b/library/contrib/byteswap/byteswap_swab24.c
@@ -0,0 +1,91 @@
+
+#if defined(__GNUC__) && defined(__PPC__)
+
+/* r3=from, r4=to, r5=len/remaining, r6/r7=index &
r7=temp, r8/r9/r10=read/write temp */ + +asm("\ + .text\n\ + .align 2\n\ + .globl swab24\n\ + .type swab24,@function\n\ +swab24:\n\ + dcbt 0,%r3\n\ + li %r7,3\n\ + divwu %r5,%r5,%r7\n\ + andi. %r7,%r5,3\n\ + srawi. %r5,%r5,2\n\ + mtctr %r5\n\ + or %r5,%r7,%r7\n\ + li %r6,0\n\ + bc 4,gt,.postfix\n\ +.loop:\n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwzx %r9,%r7,%r3\n\ + addi %r7,%r6,8\n\ + lwbrx %r10,%r7,%r3\n\ + rotlwi %r8,%r8,8\n\ + or %r7,%r9,%r9\n\ + rlwimi %r9,%r8,16,8,15\n\ + rlwimi %r9,%r10,8,16,23\n\ + rlwimi %r8,%r7,16,24,31\n\ + rotrwi %r10,%r10,8\n\ + rlwimi %r10,%r7,16,0,7\n\ + stwx %r8,%r6,%r4\n\ + addi %r6,%r6,4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,4\n\ + stwx %r10,%r6,%r4\n\ + addi %r6,%r6,4\n\ + bc 0,lt,.loop\n\ +.postfix: # Fix any remaining 24-bit words (number of remaining words in r5).\n\ + or. %r5,%r5,%r5\n\ + bc 4,gt,.exit\n\ + mtctr %r5\n\ + add %r3,%r3,%r6\n\ + add %r6,%r4,%r6\n\ + subi %r3,%r3,1\n\ +.fixloop:\n\ + lbzu %r7,1(%r3)\n\ + lbzu %r8,1(%r3)\n\ + lbzu %r9,1(%r3)\n\ + stb %r7,2(%r6)\n\ + stb %r8,1(%r6)\n\ + stb %r9,0(%r6)\n\ + addi %r6,%r6,3\n\ + bc 0,lt,.fixloop\n\ +.exit:\n\ + or %r3,%r4,%r4\n\ + blr\n\ +"); + +#else + +#include +#include + +/* + * Ugh, this is really very, very ineffiecient. + * (But simple, understandable and safe) + */ + +void *swab24(void *from,void *to,ssize_t len) +{ +uint8_t *src=from,B0,B1,B2,*dst=to; +int i; + +for(i=0;i# of 32-bit words\n\ + andi. %r7,%r5,7\n\ + li %r6,0\n\ + bc 4,gt,.preploop\n\ + mtctr %r7\n\ +.pre: # One 32-bit word at a time until we have (nLeft%8)==0 \n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + bc 0,lt,.pre\n\ +.preploop:\n\ + srawi. 
%r5,%r5,3 # Divide by 8 again to get number of loops.\n\ + addi %r8,%r8,32 # Start address for next loop (from r3).\n\ + bc 4,gt,.exit\n\ + mtctr %r5\n\ +.loop: # Loop unrolled 8 times = 32 bytes = 1 cache-line (except on the 970).\n\ + dcbt %r8,%r3 # Cache hint (prefetch) for the next loop\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + lwbrx %r7,%r6,%r3\n\ + stwx %r7,%r6,%r4\n\ + addi %r6,%r6,4\n\ + addi %r8,%r8,32 # Update cache-hint offset\n\ + bc 0,lt,.loop\n\ +.exit:\n\ + or %r3,%r4,%r4\n\ + blr\n\ +"); +*/ + +#else + +#include +#include + +void *swab32(void *from,void *to,ssize_t len) +{ +int i; +uint32_t *u32in=from,*u32out=to,tmp; + +for(i=0;i<(len>>2);i++) { + tmp=u32in[i]; + u32out[i]=((tmp&0xff)<<24)| + ((tmp&0xff00)<<8)| + ((tmp&0xff0000)>>8)| + ((tmp&0xff000000)>>24); +} + +return(to); +} + +#endif + + diff --git a/library/contrib/byteswap/byteswap_swab64.c b/library/contrib/byteswap/byteswap_swab64.c new file mode 100644 index 0000000..acaacdd --- /dev/null +++ b/library/contrib/byteswap/byteswap_swab64.c @@ -0,0 +1,101 @@ + +#if defined(__GNUC__) && defined(__PPC__) + +/* r3=from, r4=to, r5=len/temp, r6/r7=index, r8/r9=load/store temp, r10=cache hint */ + +/* This version is unrolled and uses cache-hinting. It appears to gain about 10% + * over a non-unrolled, non-hinting version. + */ + +asm("\ + .text\n\ + .align 2\n\ + .globl swab64\n\ + .type swab64,@function\n\ +swab64:\n\ + dcbt 0,%r3\n\ + andi. %r10,%r5,31 # The number of bytes handled in '.pre'. Used for prefetch hint.\n\ + srawi %r5,%r5,3 # Convert bytes-># of 64-bit words\n\ + andi. 
%r7,%r5,3\n\ + li %r6,0\n\ + bc 4,gt,.preploop\n\ + mtctr %r7\n\ +.pre: # One 64-bit word at a time until we have (nLeft%4)==0 \n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwbrx %r9,%r7,%r3\n\ + stwx %r8,%r7,%r4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,8\n\ + bc 0,lt,.pre\n\ +.preploop:\n\ + srawi. %r5,%r5,2 # Divide by 4 again to get number of loops.\n\ + addi %r10,%r10,32 # Start address for next loop.\n\ + bc 4,gt,.exit\n\ + mtctr %r5\n\ +.loop: # Loop unrolled 4 times = 32 bytes = 1 cache-line (except on the 970).\n\ + dcbt %r10,%r3 # Cache hint (prefetch) for the next iteration\n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwbrx %r9,%r7,%r3\n\ + stwx %r8,%r7,%r4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,8\n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwbrx %r9,%r7,%r3\n\ + stwx %r8,%r7,%r4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,8\n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwbrx %r9,%r7,%r3\n\ + stwx %r8,%r7,%r4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,8\n\ + lwbrx %r8,%r6,%r3\n\ + addi %r7,%r6,4\n\ + lwbrx %r9,%r7,%r3\n\ + stwx %r8,%r7,%r4\n\ + stwx %r9,%r6,%r4\n\ + addi %r6,%r6,8\n\ + addi %r10,%r10,32 # Update cache-hint offset\n\ + bc 0,lt,.loop\n\ +.exit:\n\ + or %r3,%r4,%r4\n\ + blr\n\ +"); + +#else + +#include +#include + +void *swab64(void *from,void *to,ssize_t len) +{ +int i; +struct { + uint32_t u32[2]; +} *u64in=from,*u64out=to; +uint32_t tmp1,tmp2; + +for(i=0;i<(len>>3);i++) { + tmp1=u64in[i].u32[0]; + tmp2=u64in[i].u32[1]; + u64out[i].u32[0]=((tmp2&0xff)<<24)| + ((tmp2&0xff00)<<8)| + ((tmp2&0xff0000)>>8)| + ((tmp2&0xff000000)>>24); + u64out[i].u32[1]=((tmp1&0xff)<<24)| + ((tmp1&0xff00)<<8)| + ((tmp1&0xff0000)>>8)| + ((tmp1&0xff000000)>>24); +} + +return(to); +} + +#endif + +/* vi:set ts=3: */ +