1
0
mirror of https://github.com/adtools/clib2.git synced 2025-12-08 14:59:05 +00:00

- Added a directory to hold contributed code which has not been integrated
  with the library yet.

- The byteswap code was contributed by Peter Bengtsson. Thank you very much!


git-svn-id: file:///Users/olsen/Code/migration-svn-zu-git/logical-line-staging/clib2/trunk@15163 87f5fb63-7c3d-0410-a384-fd976d0f7a62
This commit is contained in:
Olaf Barthel
2006-11-13 09:49:49 +00:00
parent 7e1d5d6f6a
commit 66303e9ba2
10 changed files with 696 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
#ifndef __BYTESWAP_H
#define __BYTESWAP_H

/*
 * Byte-order reversal for 16-, 24-, 32- and 64-bit values, as single-value
 * functions (bswap*) and as whole-buffer operations (swab*). GCC builds get
 * inline fast paths for PPC and 68020+; other targets call the out-of-line
 * library implementations.
 */

#include <sys/types.h>
#include <stdint.h>

#if defined(__GNUC__)
#define __CONST_FUNC __attribute__((const))
#else
#define __CONST_FUNC /* Nothing */
#endif

/* Single value byteswap functions. */
extern __CONST_FUNC uint16_t bswap16(uint16_t);
extern __CONST_FUNC uint32_t bswap24(uint32_t);
extern __CONST_FUNC uint32_t bswap32(uint32_t);
#ifdef INT64_MIN
extern __CONST_FUNC uint64_t bswap64(uint64_t);
#endif

/* Block byteswap functions. The swab() function usually resides in unistd.h, so perhaps it should be moved there? */
/* NOTE: Contrary to the standard swab(), this version returns the "to" pointer and the pointers are not restrict
 * qualified - so swapping buffer-contents in-place is supported.
 * Also, swab24(), swab32() and swab64() are non-standard functions.
 */
extern void *swab(void *from,void *to,ssize_t nbytes);
extern void *swab24(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 24-bit words instead. */
extern void *swab32(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 32-bit words instead. */
extern void *swab64(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 64-bit words instead. */

/* swab16() is just another name for swab(). An object-like macro passes the
 * full (from,to,nbytes) argument list straight through; the former
 * single-argument function-like definition could not compile at a call site. */
#define swab16 swab

/*
 * Optimized inline-versions for the single-value functions follow.
 * Only GCC+PPC and GCC+m68k support for now.
 */
#if defined(__GNUC__)

/* Select implementation: constant arguments are folded at compile time via
 * the pure-C __const_swap*() helpers below. The expansions are fully
 * parenthesized so the conditional operator cannot bind to neighbouring
 * operators at the use site. */
#define bswap16(x) ((__builtin_constant_p(x))?__const_swap16(x):__swap16(x))
#define bswap24(x) ((__builtin_constant_p(x))?__const_swap24(x):__swap24(x))
#define bswap32(x) ((__builtin_constant_p(x))?__const_swap32(x):__swap32(x))
#define bswap64(x) ((__builtin_constant_p(x))?__const_swap64(x):__swap64(x))

/* Assembler implementations */
#if defined(__PPC__)

/* Swap the two bytes of a 16-bit value. */
static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) {
uint_fast16_t result;
__asm__("\
rlwinm %[result],%[u16],8,16,24\n\
rlwimi %[result],%[u16],24,24,31\n\
":[result]"=&r"(result):[u16]"r"(u16));
return(result);
}

/* Reverse the three low bytes of a 24-bit value held in a 32-bit word. */
static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) {
uint_fast32_t result;
__asm__("\
rlwinm %[result],%[u32],16,8,31\n\
rlwimi %[result],%[u32],0,16,24\n\
":[result]"=&r"(result):[u32]"r"(u32));
return(result);
}

/* Reverse the byte order of a 32-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) {
uint_fast32_t result;
__asm__("\
rlwinm %[result],%[u32],8,8,31\n\
rlwimi %[result],%[u32],24,0,7\n\
rlwimi %[result],%[u32],24,16,23\n\
":[result]"=&r"(result):[u32]"r"(u32));
return(result);
}

/*
 * Note: __swap64() might perhaps be optimized a bit more by scheduling the
 * instructions to alternate register-use, but this instead means there
 * are two less registers free since "u64" and "result" may no longer overlap.
 * Decisions, decisions....
 */
static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
uint_fast64_t result;
uint_fast32_t tmp;
__asm__("\
rlwinm %[tmp],%[u64],8,8,31\n\
rlwimi %[tmp],%[u64],24,0,7\n\
rlwimi %[tmp],%[u64],24,16,23\n\
rlwinm %[result],%L[u64],8,8,31\n\
rlwimi %[result],%L[u64],24,0,7\n\
rlwimi %[result],%L[u64],24,16,23\n\
or %L[result],%[tmp],%[tmp]\n\
":[result]"=r"(result),[tmp]"=&r"(tmp):[u64]"r"(u64));
return(result);
}

#elif defined(__mc68020__)

/* 68020+: a word rotate by 8 swaps the two bytes in place. */
static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) {
__asm__("\
rol.w #8,%[u16]\n\
":[u16]"+d"(u16)::"cc");
return(u16);
}

/* 68020+: reverse the low three bytes of a 24-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) {
__asm__("\
rol.w #8,%[u32]\n\
swap %[u32]\n\
rol.w #8,%[u32]\n\
ror.l #8,%[u32]\n\
":[u32]"+d"(u32)::"cc");
return(u32);
}

/* 68020+: classic rol/swap/rol sequence byte-reverses a 32-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) {
__asm__("\
rol.w #8,%[u32]\n\
swap %[u32]\n\
rol.w #8,%[u32]\n\
":[u32]"+d"(u32)::"cc");
return(u32);
}

/* 68020+: byte-reverse each 32-bit half, then exchange the halves.
 * (Bug fix: the final operand previously read "L%[u64]", which emits a
 * literal 'L' instead of the %L low-word operand modifier and cannot
 * assemble.) */
static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
__asm__("\
rol.w #8,%[u64]\n\
rol.w #8,%L[u64]\n\
swap %[u64]\n\
swap %L[u64]\n\
rol.w #8,%[u64]\n\
rol.w #8,%L[u64]\n\
exg %[u64],%L[u64]\n\
":[u64]"+d"(u64)::"cc");
return(u64);
}

#else
/* Unknown or undefined architecture. Perhaps compiling with "-strict -ansi", but should not use this header then anyway. */
/* Fall back to the external library functions at run time; constant
 * arguments are still folded via __const_swap*(). Function-like macros do
 * not expand recursively, so the "bswap16(x)" in the expansion refers to
 * the extern function declared above. */
#undef bswap16
#undef bswap24
#undef bswap32
#undef bswap64
#define bswap16(x) ((__builtin_constant_p(x))?__const_swap16(x):bswap16(x))
#define bswap24(x) ((__builtin_constant_p(x))?__const_swap24(x):bswap24(x))
#define bswap32(x) ((__builtin_constant_p(x))?__const_swap32(x):bswap32(x))
#define bswap64(x) ((__builtin_constant_p(x))?__const_swap64(x):bswap64(x))
#endif

/* C implementations for constant values */

/* Swap the two bytes of a 16-bit value. */
static __inline__ uint16_t __const_swap16(uint16_t u16) {
return(u16>>8|u16<<8);
}

/* Reverse the low three bytes of a 24-bit value; the top byte becomes 0. */
static __inline__ uint32_t __const_swap24(uint32_t u32) {
return(((u32&0xff)<<16)|((u32&0xff00))|((u32&0xff0000)>>16));
}

/* Reverse the byte order of a 32-bit value. */
static __inline__ uint32_t __const_swap32(uint32_t u32) {
return(((u32&0xff)<<24)|((u32&0xff00)<<8)|((u32&0xff0000)>>8)|((u32&0xff000000)>>24));
}

/* Reverse the byte order of a 64-bit value. */
static __inline__ uint64_t __const_swap64(uint64_t u64) {
return(((u64&0xffLL)<<56)|((u64&0xff00LL)<<40)|((u64&0xff0000LL)<<24)|((u64&0xff000000LL)<<8)|
((u64&0xff00000000LL)>>8)|((u64&0xff0000000000LL)>>24)|((u64&0xff000000000000LL)>>40)|((u64&0xff00000000000000LL)>>56));
}

#endif /* __GNUC__ */
#endif /* __BYTESWAP_H */
/* vi:set ts=3: */

View File

@@ -0,0 +1,28 @@
/* bswap16() - return the 16-bit argument with its two bytes exchanged. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: the argument arrives in r3 and the result is returned in r3.
 * The commented-out three-instruction variant is kept for reference; the
 * live sequence does the swap with one rotate-insert plus one shift. */
asm("\
.text\n\
.align 2\n\
.globl bswap16\n\
.type bswap16, @function\n\
bswap16:\n\
# rlwinm %r4,%r3,8,16,24\n\
# rlwimi %r4,%r3,24,24,31\n\
# or %r3,%r4,%r4\n\
rlwimi %r3,%r3,16,8,15\n\
srwi %r3,%r3,8\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: shift the two bytes past each other and merge. */
uint16_t bswap16(uint16_t u16)
{
return(u16>>8|u16<<8);
}
#endif

View File

@@ -0,0 +1,29 @@
/* bswap24() - reverse the order of the three low bytes of a 24-bit value
 * stored in a 32-bit word; the top byte of the result is always zero. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: argument in r3, result in r3, r4 is scratch.
 * (Bug fix: the entry label previously read "bswap32:", so the global
 * symbol bswap24 advertised by .globl/.type was never actually defined.) */
asm(" .text\n\
.align 2\n\
.globl bswap24\n\
.type bswap24, @function\n\
bswap24:\n\
rlwinm %r4,%r3,16,8,31\n\
rlwimi %r4,%r3,0,16,24\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: pick each of the three low bytes out and move it to
 * its mirrored position; any bits above bit 23 are discarded. */
uint32_t bswap24(uint32_t u32)
{
return(
((u32&0xff)<<16)|
((u32&0xff00))|
((u32&0xff0000)>>16)
);
}
#endif

View File

@@ -0,0 +1,31 @@
/* bswap32() - return the 32-bit argument with its byte order reversed. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: argument in r3, result in r3, r4 is scratch. Two
 * rotate-and-insert instructions assemble the reversed word. */
asm(" .text\n\
.align 2\n\
.globl bswap32\n\
.type bswap32, @function\n\
bswap32:\n\
rlwinm %r4,%r3,8,8,31\n\
rlwimi %r4,%r3,24,0,7\n\
rlwimi %r4,%r3,24,16,23\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: mask each byte out and move it to its mirrored
 * position. */
uint32_t bswap32(uint32_t u32)
{
return(
((u32&0xff)<<24)|
((u32&0xff00)<<8)|
((u32&0xff0000)>>8)|
((u32&0xff000000)>>24)
);
}
#endif

View File

@@ -0,0 +1,48 @@
/* bswap64() - return the 64-bit argument with its byte order reversed.
 * The whole file is compiled only when the library is built with 64-bit
 * integer support.
 * NOTE(review): the header guards the bswap64() declaration with
 * #ifdef INT64_MIN while this file keys on USE_64_BIT_INTS - confirm the
 * two guards always agree in the build configuration. */
#if defined(USE_64_BIT_INTS)
#if defined(__PPC__) && defined(__GNUC__)
/* PPC (32-bit ABI) version: the argument arrives in r3(high)/r4(low) and
 * the result is returned in r3/r4; r5 is scratch. Each 32-bit half is
 * byte-reversed and the two halves are exchanged. */
asm(" .text\n\
.align 2\n\
.globl bswap64\n\
.type bswap64, @function\n\
bswap64:\n\
rlwinm %r5,%r3,8,8,31\n\
rlwimi %r5,%r3,24,0,7\n\
rlwimi %r5,%r3,24,16,23\n\
rlwinm %r3,%r4,8,8,31\n\
rlwimi %r3,%r4,24,0,7\n\
rlwimi %r3,%r4,24,16,23\n\
or %r4,%r5,%r5\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: view the value as two 32-bit halves, byte-swap each
 * half and store them in exchanged order. The operation is positional, so
 * the result is correct regardless of host endianness. */
uint64_t bswap64(uint64_t u64)
{
union {
uint64_t ll;
uint32_t l[2];
} v={.ll=u64};
uint32_t tmp;
tmp=v.l[0];
v.l[0]=((v.l[1]&0xff)<<24)|
((v.l[1]&0xff00)<<8)|
((v.l[1]&0xff0000)>>8)|
((v.l[1]&0xff000000)>>24);
v.l[1]=((tmp&0xff)<<24)|
((tmp&0xff00)<<8)|
((tmp&0xff0000)>>8)|
((tmp&0xff000000)>>24);
return(v.ll);
}
#endif
#endif

View File

@@ -0,0 +1,70 @@
/* swab() - copy "len" bytes from "from" to "to" while exchanging adjacent
 * bytes (byte-swapping each 16-bit word). Unlike POSIX swab() this returns
 * the "to" pointer, and the pointers are not restrict-qualified so in-place
 * operation (from == to) is supported. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/count, r6=index, r7=load/store/temp */
/* PPC version: uses byte-reversed loads (lhbrx/lwbrx); the main loops
 * half-swap two 16-bit words per 32-bit access, with fixups for unaligned
 * starts and odd word counts.
 * NOTE(review): in ".oddloop" the instruction "sub %r6,%r6,2" makes gas
 * treat the bare "2" as register r2, i.e. it subtracts r2's *contents* from
 * the index - "subi %r6,%r6,2" looks like what was intended. Also verify
 * the odd-count path when the 32-bit loop count is zero: "mtctr" is then
 * loaded with 0 and the decrementing "bc 0,lt" branch would wrap CTR.
 * Confirm both on hardware before relying on this path. */
asm("\
.text\n\
.align 2\n\
.globl swab\n\
.type swab,@function\n\
swab:\n\
dcbt 0,%r3\n\
srawi. %r5,%r5,1\n\
bc 4,gt,.exit\n\
andi. %r7,%r3,3 # Check if we start on an address evenly divisible by 4.\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
lhbrx %r7,%r6,%r3 # Fix alignment if needed.\n\
sthx %r7,%r6,%r4\n\
addi %r6,%r6,2\n\
subi %r5,%r5,1\n\
.preploop:\n\
andi. %r7,%r5,1 # Check if even or odd number of 16-bit words.\n\
srawi %r5,%r5,1 # Number of 32-bit words to half-swap.\n\
mtctr %r5\n\
bc 12,gt,.oddloop # Jump to loop for odd number of 16-bit words.\n\
.loop: # Loop is 'unrolled' by reading/writing 32-bit words.\n\
lwbrx %r7,%r6,%r3\n\
rotlwi %r7,%r7,16\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
.oddloop:\n\
lwbrx %r7,%r6,%r3\n\
rotlwi %r7,%r7,16\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.oddloop\n\
sub %r6,%r6,2\n\
lhbrx %r7,%r6,%r3 # Fix last 16-bit word.\n\
sthx %r7,%r6,%r4\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: swap the bytes of each complete 16-bit word; a
 * trailing odd byte is ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int, so buffers with more than
 * INT_MAX halfwords would overflow it - confirm this limit is acceptable. */
void *swab(void *from,void *to,ssize_t len)
{
int i;
uint16_t u16,*u16in=from,*u16out=to;
for(i=0;i<(len>>1);i++) {
u16=u16in[i];
u16out[i]=u16>>8|u16<<8;
}
return(u16out);
}
#endif

View File

@@ -0,0 +1,91 @@
/* swab24() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 24-bit (3-byte) word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/remaining, r6/r7=index & r7=temp, r8/r9/r10=read/write temp */
/* PPC version: converts len to a 24-bit word count (len/3), processes four
 * words (12 bytes) per main-loop iteration, then fixes up the remaining
 * 0..3 words one byte at a time in ".fixloop". */
asm("\
.text\n\
.align 2\n\
.globl swab24\n\
.type swab24,@function\n\
swab24:\n\
dcbt 0,%r3\n\
li %r7,3\n\
divwu %r5,%r5,%r7\n\
andi. %r7,%r5,3\n\
srawi. %r5,%r5,2\n\
mtctr %r5\n\
or %r5,%r7,%r7\n\
li %r6,0\n\
bc 4,gt,.postfix\n\
.loop:\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwzx %r9,%r7,%r3\n\
addi %r7,%r6,8\n\
lwbrx %r10,%r7,%r3\n\
rotlwi %r8,%r8,8\n\
or %r7,%r9,%r9\n\
rlwimi %r9,%r8,16,8,15\n\
rlwimi %r9,%r10,8,16,23\n\
rlwimi %r8,%r7,16,24,31\n\
rotrwi %r10,%r10,8\n\
rlwimi %r10,%r7,16,0,7\n\
stwx %r8,%r6,%r4\n\
addi %r6,%r6,4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,4\n\
stwx %r10,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.postfix: # Fix any remaining 24-bit words (number of remaining words in r5).\n\
or. %r5,%r5,%r5\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
add %r3,%r3,%r6\n\
add %r6,%r4,%r6\n\
subi %r3,%r3,1\n\
.fixloop:\n\
lbzu %r7,1(%r3)\n\
lbzu %r8,1(%r3)\n\
lbzu %r9,1(%r3)\n\
stb %r7,2(%r6)\n\
stb %r8,1(%r6)\n\
stb %r9,0(%r6)\n\
addi %r6,%r6,3\n\
bc 0,lt,.fixloop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/*
 * Ugh, this is really very, very inefficient.
 * (But simple, understandable and safe)
 */
/* Portable C fallback: reverse the bytes of each complete 3-byte word.
 * Like the assembler version, which computes len/3 up front, any trailing
 * 1 or 2 bytes are left untouched. */
void *swab24(void *from,void *to,ssize_t len)
{
uint8_t *src=from,B0,B1,B2,*dst=to;
int i;
/* i+2<len guarantees a full 3-byte word is available; the previous
 * condition (i<len) read up to two bytes past the end of the buffer
 * whenever len was not a multiple of 3. */
for(i=0;i+2<len;i+=3) {
B0=src[i];
B1=src[i+1];
B2=src[i+2];
dst[i]=B2;
dst[i+1]=B1;
dst[i+2]=B0;
}
return(to);
}
#endif

View File

@@ -0,0 +1,112 @@
/* swab32() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 32-bit word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len, r6=index, r7=load/store temp */
/* PPC version: lwbrx performs the full byte reversal during the load, so
 * the loop body is a single load/store pair per 32-bit word. */
asm("\
.text\n\
.align 2\n\
.globl swab32\n\
.type swab32,@function\n\
swab32:\n\
srawi. %r5,%r5,2\n\
li %r6,0\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop:\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
/* The alternative implementation below is intentionally disabled; see the
 * author's note inside the comment for the size/speed trade-off. */
/* r3=from, r4=to, r5=len/temp, r6=index, r7=load/store temp, r8=cache hint
 *
 * The unrolled, cache-hinting version appears to be about 4.5% faster, but
 * in this case I opted for the smaller implementation. swab64() appears to
 * gain more from cache-hinting - probably because of it using more registers
 * for intermediate storage.
asm("\
.text\n\
.align 2\n\
.globl swab32\n\
.type swab32,@function\n\
swab32:\n\
dcbt 0,%r3\n\
andi. %r8,%r5,31 # The number of bytes handled in '.pre'. Used for prefetch hint.\n\
srawi %r5,%r5,2 # Convert bytes-># of 32-bit words\n\
andi. %r7,%r5,7\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
mtctr %r7\n\
.pre: # One 32-bit word at a time until we have (nLeft%8)==0 \n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.pre\n\
.preploop:\n\
srawi. %r5,%r5,3 # Divide by 8 again to get number of loops.\n\
addi %r8,%r8,32 # Start address for next loop (from r3).\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop: # Loop unrolled 8 times = 32 bytes = 1 cache-line (except on the 970).\n\
dcbt %r8,%r3 # Cache hint (prefetch) for the next loop\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
addi %r8,%r8,32 # Update cache-hint offset\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
*/
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: byte-reverse each complete 32-bit word; any trailing
 * 1..3 bytes are ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int; buffers with more than INT_MAX
 * words would overflow it. */
void *swab32(void *from,void *to,ssize_t len)
{
int i;
uint32_t *u32in=from,*u32out=to,tmp;
for(i=0;i<(len>>2);i++) {
tmp=u32in[i];
u32out[i]=((tmp&0xff)<<24)|
((tmp&0xff00)<<8)|
((tmp&0xff0000)>>8)|
((tmp&0xff000000)>>24);
}
return(to);
}
#endif

View File

@@ -0,0 +1,101 @@
/* swab64() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 64-bit word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/temp, r6/r7=index, r8/r9=load/store temp, r10=cache hint */
/* This version is unrolled and uses cache-hinting. It appears to gain about 10%
 * over a non-unrolled, non-hinting version.
 */
/* Each 64-bit word is handled as two lwbrx byte-reversed loads whose results
 * are stored with their offsets exchanged, producing a full 8-byte reversal.
 * ".pre" consumes 64-bit words one at a time until the remaining count is a
 * multiple of 4; ".loop" then processes 4 words (32 bytes) per iteration. */
asm("\
.text\n\
.align 2\n\
.globl swab64\n\
.type swab64,@function\n\
swab64:\n\
dcbt 0,%r3\n\
andi. %r10,%r5,31 # The number of bytes handled in '.pre'. Used for prefetch hint.\n\
srawi %r5,%r5,3 # Convert bytes-># of 64-bit words\n\
andi. %r7,%r5,3\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
mtctr %r7\n\
.pre: # One 64-bit word at a time until we have (nLeft%4)==0 \n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
bc 0,lt,.pre\n\
.preploop:\n\
srawi. %r5,%r5,2 # Divide by 4 again to get number of loops.\n\
addi %r10,%r10,32 # Start address for next loop.\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop: # Loop unrolled 4 times = 32 bytes = 1 cache-line (except on the 970).\n\
dcbt %r10,%r3 # Cache hint (prefetch) for the next iteration\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
addi %r10,%r10,32 # Update cache-hint offset\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: treat each 64-bit word as two 32-bit halves,
 * byte-swap each half and store the halves in exchanged order (positional,
 * so correct regardless of host endianness). Trailing bytes that do not
 * form a complete 8-byte word are ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int; buffers with more than INT_MAX
 * words would overflow it. */
void *swab64(void *from,void *to,ssize_t len)
{
int i;
struct {
uint32_t u32[2];
} *u64in=from,*u64out=to;
uint32_t tmp1,tmp2;
for(i=0;i<(len>>3);i++) {
tmp1=u64in[i].u32[0];
tmp2=u64in[i].u32[1];
u64out[i].u32[0]=((tmp2&0xff)<<24)|
((tmp2&0xff00)<<8)|
((tmp2&0xff0000)>>8)|
((tmp2&0xff000000)>>24);
u64out[i].u32[1]=((tmp1&0xff)<<24)|
((tmp1&0xff00)<<8)|
((tmp1&0xff0000)>>8)|
((tmp1&0xff000000)>>24);
}
return(to);
}
#endif
/* vi:set ts=3: */