1
0
mirror of https://github.com/adtools/clib2.git synced 2025-12-08 14:59:05 +00:00

- Added a directory to hold contributed code which has not been integrated
  with the library yet.

- The byteswap code was contributed by Peter Bengtsson. Thank you very much!


git-svn-id: file:///Users/olsen/Code/migration-svn-zu-git/logical-line-staging/clib2/trunk@15163 87f5fb63-7c3d-0410-a384-fd976d0f7a62
This commit is contained in:
Olaf Barthel
2006-11-13 09:49:49 +00:00
parent 7e1d5d6f6a
commit 66303e9ba2
10 changed files with 696 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
#ifndef __BYTESWAP_H
#define __BYTESWAP_H

/*
 * Byte-order reversal for 16-, 24-, 32- and 64-bit values, as single-value
 * functions (bswap*) and as whole-buffer operations (swab*). GCC builds get
 * inline fast paths for PPC and 68020+; other targets call the out-of-line
 * library implementations.
 */

#include <sys/types.h>
#include <stdint.h>

#if defined(__GNUC__)
#define __CONST_FUNC __attribute__((const))
#else
#define __CONST_FUNC /* Nothing */
#endif

/* Single value byteswap functions. */
extern __CONST_FUNC uint16_t bswap16(uint16_t);
extern __CONST_FUNC uint32_t bswap24(uint32_t);
extern __CONST_FUNC uint32_t bswap32(uint32_t);
#ifdef INT64_MIN
extern __CONST_FUNC uint64_t bswap64(uint64_t);
#endif

/* Block byteswap functions. The swab() function usually resides in unistd.h, so perhaps it should be moved there? */
/* NOTE: Contrary to the standard swab(), this version returns the "to" pointer and the pointers are not restrict
 * qualified - so swapping buffer-contents in-place is supported.
 * Also, swab24(), swab32() and swab64() are non-standard functions.
 */
extern void *swab(void *from,void *to,ssize_t nbytes);
extern void *swab24(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 24-bit words instead. */
extern void *swab32(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 32-bit words instead. */
extern void *swab64(void *from,void *to,ssize_t nbytes); /* Same as swab(), but operates on 64-bit words instead. */

/* swab16() is just another name for swab(). An object-like macro passes the
 * full (from,to,nbytes) argument list straight through; the former
 * single-argument function-like definition could not compile at a call site. */
#define swab16 swab

/*
 * Optimized inline-versions for the single-value functions follow.
 * Only GCC+PPC and GCC+m68k support for now.
 */
#if defined(__GNUC__)

/* Select implementation: constant arguments are folded at compile time via
 * the pure-C __const_swap*() helpers below. The expansions are fully
 * parenthesized so the conditional operator cannot bind to neighbouring
 * operators at the use site. */
#define bswap16(x) ((__builtin_constant_p(x))?__const_swap16(x):__swap16(x))
#define bswap24(x) ((__builtin_constant_p(x))?__const_swap24(x):__swap24(x))
#define bswap32(x) ((__builtin_constant_p(x))?__const_swap32(x):__swap32(x))
#define bswap64(x) ((__builtin_constant_p(x))?__const_swap64(x):__swap64(x))

/* Assembler implementations */
#if defined(__PPC__)

/* Swap the two bytes of a 16-bit value. */
static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) {
uint_fast16_t result;
__asm__("\
rlwinm %[result],%[u16],8,16,24\n\
rlwimi %[result],%[u16],24,24,31\n\
":[result]"=&r"(result):[u16]"r"(u16));
return(result);
}

/* Reverse the three low bytes of a 24-bit value held in a 32-bit word. */
static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) {
uint_fast32_t result;
__asm__("\
rlwinm %[result],%[u32],16,8,31\n\
rlwimi %[result],%[u32],0,16,24\n\
":[result]"=&r"(result):[u32]"r"(u32));
return(result);
}

/* Reverse the byte order of a 32-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) {
uint_fast32_t result;
__asm__("\
rlwinm %[result],%[u32],8,8,31\n\
rlwimi %[result],%[u32],24,0,7\n\
rlwimi %[result],%[u32],24,16,23\n\
":[result]"=&r"(result):[u32]"r"(u32));
return(result);
}

/*
 * Note: __swap64() might perhaps be optimized a bit more by scheduling the
 * instructions to alternate register-use, but this instead means there
 * are two less registers free since "u64" and "result" may no longer overlap.
 * Decisions, decisions....
 */
static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
uint_fast64_t result;
uint_fast32_t tmp;
__asm__("\
rlwinm %[tmp],%[u64],8,8,31\n\
rlwimi %[tmp],%[u64],24,0,7\n\
rlwimi %[tmp],%[u64],24,16,23\n\
rlwinm %[result],%L[u64],8,8,31\n\
rlwimi %[result],%L[u64],24,0,7\n\
rlwimi %[result],%L[u64],24,16,23\n\
or %L[result],%[tmp],%[tmp]\n\
":[result]"=r"(result),[tmp]"=&r"(tmp):[u64]"r"(u64));
return(result);
}

#elif defined(__mc68020__)

/* 68020+: a word rotate by 8 swaps the two bytes in place. */
static __inline__ __CONST_FUNC uint16_t __swap16(uint16_t u16) {
__asm__("\
rol.w #8,%[u16]\n\
":[u16]"+d"(u16)::"cc");
return(u16);
}

/* 68020+: reverse the low three bytes of a 24-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap24(uint32_t u32) {
__asm__("\
rol.w #8,%[u32]\n\
swap %[u32]\n\
rol.w #8,%[u32]\n\
ror.l #8,%[u32]\n\
":[u32]"+d"(u32)::"cc");
return(u32);
}

/* 68020+: classic rol/swap/rol sequence byte-reverses a 32-bit value. */
static __inline__ __CONST_FUNC uint32_t __swap32(uint32_t u32) {
__asm__("\
rol.w #8,%[u32]\n\
swap %[u32]\n\
rol.w #8,%[u32]\n\
":[u32]"+d"(u32)::"cc");
return(u32);
}

/* 68020+: byte-reverse each 32-bit half, then exchange the halves.
 * (Bug fix: the final operand previously read "L%[u64]", which emits a
 * literal 'L' instead of the %L low-word operand modifier and cannot
 * assemble.) */
static __inline__ __CONST_FUNC uint64_t __swap64(uint64_t u64) {
__asm__("\
rol.w #8,%[u64]\n\
rol.w #8,%L[u64]\n\
swap %[u64]\n\
swap %L[u64]\n\
rol.w #8,%[u64]\n\
rol.w #8,%L[u64]\n\
exg %[u64],%L[u64]\n\
":[u64]"+d"(u64)::"cc");
return(u64);
}

#else
/* Unknown or undefined architecture. Perhaps compiling with "-strict -ansi", but should not use this header then anyway. */
/* Fall back to the external library functions at run time; constant
 * arguments are still folded via __const_swap*(). Function-like macros do
 * not expand recursively, so the "bswap16(x)" in the expansion refers to
 * the extern function declared above. */
#undef bswap16
#undef bswap24
#undef bswap32
#undef bswap64
#define bswap16(x) ((__builtin_constant_p(x))?__const_swap16(x):bswap16(x))
#define bswap24(x) ((__builtin_constant_p(x))?__const_swap24(x):bswap24(x))
#define bswap32(x) ((__builtin_constant_p(x))?__const_swap32(x):bswap32(x))
#define bswap64(x) ((__builtin_constant_p(x))?__const_swap64(x):bswap64(x))
#endif

/* C implementations for constant values */

/* Swap the two bytes of a 16-bit value. */
static __inline__ uint16_t __const_swap16(uint16_t u16) {
return(u16>>8|u16<<8);
}

/* Reverse the low three bytes of a 24-bit value; the top byte becomes 0. */
static __inline__ uint32_t __const_swap24(uint32_t u32) {
return(((u32&0xff)<<16)|((u32&0xff00))|((u32&0xff0000)>>16));
}

/* Reverse the byte order of a 32-bit value. */
static __inline__ uint32_t __const_swap32(uint32_t u32) {
return(((u32&0xff)<<24)|((u32&0xff00)<<8)|((u32&0xff0000)>>8)|((u32&0xff000000)>>24));
}

/* Reverse the byte order of a 64-bit value. */
static __inline__ uint64_t __const_swap64(uint64_t u64) {
return(((u64&0xffLL)<<56)|((u64&0xff00LL)<<40)|((u64&0xff0000LL)<<24)|((u64&0xff000000LL)<<8)|
((u64&0xff00000000LL)>>8)|((u64&0xff0000000000LL)>>24)|((u64&0xff000000000000LL)>>40)|((u64&0xff00000000000000LL)>>56));
}

#endif /* __GNUC__ */
#endif /* __BYTESWAP_H */
/* vi:set ts=3: */

View File

@@ -0,0 +1,28 @@
/* bswap16() - return the 16-bit argument with its two bytes exchanged. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: the argument arrives in r3 and the result is returned in r3.
 * The commented-out three-instruction variant is kept for reference; the
 * live sequence does the swap with one rotate-insert plus one shift. */
asm("\
.text\n\
.align 2\n\
.globl bswap16\n\
.type bswap16, @function\n\
bswap16:\n\
# rlwinm %r4,%r3,8,16,24\n\
# rlwimi %r4,%r3,24,24,31\n\
# or %r3,%r4,%r4\n\
rlwimi %r3,%r3,16,8,15\n\
srwi %r3,%r3,8\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: shift the two bytes past each other and merge. */
uint16_t bswap16(uint16_t u16)
{
return(u16>>8|u16<<8);
}
#endif

View File

@@ -0,0 +1,29 @@
/* bswap24() - reverse the order of the three low bytes of a 24-bit value
 * stored in a 32-bit word; the top byte of the result is always zero. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: argument in r3, result in r3, r4 is scratch.
 * (Bug fix: the entry label previously read "bswap32:", so the global
 * symbol bswap24 advertised by .globl/.type was never actually defined.) */
asm(" .text\n\
.align 2\n\
.globl bswap24\n\
.type bswap24, @function\n\
bswap24:\n\
rlwinm %r4,%r3,16,8,31\n\
rlwimi %r4,%r3,0,16,24\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: pick each of the three low bytes out and move it to
 * its mirrored position; any bits above bit 23 are discarded. */
uint32_t bswap24(uint32_t u32)
{
return(
((u32&0xff)<<16)|
((u32&0xff00))|
((u32&0xff0000)>>16)
);
}
#endif

View File

@@ -0,0 +1,31 @@
/* bswap32() - return the 32-bit argument with its byte order reversed. */
#if defined(__PPC__) && defined(__GNUC__)
/* PPC version: argument in r3, result in r3, r4 is scratch. Two
 * rotate-and-insert instructions assemble the reversed word. */
asm(" .text\n\
.align 2\n\
.globl bswap32\n\
.type bswap32, @function\n\
bswap32:\n\
rlwinm %r4,%r3,8,8,31\n\
rlwimi %r4,%r3,24,0,7\n\
rlwimi %r4,%r3,24,16,23\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: mask each byte out and move it to its mirrored
 * position. */
uint32_t bswap32(uint32_t u32)
{
return(
((u32&0xff)<<24)|
((u32&0xff00)<<8)|
((u32&0xff0000)>>8)|
((u32&0xff000000)>>24)
);
}
#endif

View File

@@ -0,0 +1,48 @@
/* bswap64() - return the 64-bit argument with its byte order reversed.
 * The whole file is compiled only when the library is built with 64-bit
 * integer support.
 * NOTE(review): the header guards the bswap64() declaration with
 * #ifdef INT64_MIN while this file keys on USE_64_BIT_INTS - confirm the
 * two guards always agree in the build configuration. */
#if defined(USE_64_BIT_INTS)
#if defined(__PPC__) && defined(__GNUC__)
/* PPC (32-bit ABI) version: the argument arrives in r3(high)/r4(low) and
 * the result is returned in r3/r4; r5 is scratch. Each 32-bit half is
 * byte-reversed and the two halves are exchanged. */
asm(" .text\n\
.align 2\n\
.globl bswap64\n\
.type bswap64, @function\n\
bswap64:\n\
rlwinm %r5,%r3,8,8,31\n\
rlwimi %r5,%r3,24,0,7\n\
rlwimi %r5,%r3,24,16,23\n\
rlwinm %r3,%r4,8,8,31\n\
rlwimi %r3,%r4,24,0,7\n\
rlwimi %r3,%r4,24,16,23\n\
or %r4,%r5,%r5\n\
blr\n\
");
#else
#include <stdint.h>
/* Portable C fallback: view the value as two 32-bit halves, byte-swap each
 * half and store them in exchanged order. The operation is positional, so
 * the result is correct regardless of host endianness. */
uint64_t bswap64(uint64_t u64)
{
union {
uint64_t ll;
uint32_t l[2];
} v={.ll=u64};
uint32_t tmp;
tmp=v.l[0];
v.l[0]=((v.l[1]&0xff)<<24)|
((v.l[1]&0xff00)<<8)|
((v.l[1]&0xff0000)>>8)|
((v.l[1]&0xff000000)>>24);
v.l[1]=((tmp&0xff)<<24)|
((tmp&0xff00)<<8)|
((tmp&0xff0000)>>8)|
((tmp&0xff000000)>>24);
return(v.ll);
}
#endif
#endif

View File

@@ -0,0 +1,70 @@
/* swab() - copy "len" bytes from "from" to "to" while exchanging adjacent
 * bytes (byte-swapping each 16-bit word). Unlike POSIX swab() this returns
 * the "to" pointer, and the pointers are not restrict-qualified so in-place
 * operation (from == to) is supported. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/count, r6=index, r7=load/store/temp */
/* PPC version: uses byte-reversed loads (lhbrx/lwbrx); the main loops
 * half-swap two 16-bit words per 32-bit access, with fixups for unaligned
 * starts and odd word counts.
 * NOTE(review): in ".oddloop" the instruction "sub %r6,%r6,2" makes gas
 * treat the bare "2" as register r2, i.e. it subtracts r2's *contents* from
 * the index - "subi %r6,%r6,2" looks like what was intended. Also verify
 * the odd-count path when the 32-bit loop count is zero: "mtctr" is then
 * loaded with 0 and the decrementing "bc 0,lt" branch would wrap CTR.
 * Confirm both on hardware before relying on this path. */
asm("\
.text\n\
.align 2\n\
.globl swab\n\
.type swab,@function\n\
swab:\n\
dcbt 0,%r3\n\
srawi. %r5,%r5,1\n\
bc 4,gt,.exit\n\
andi. %r7,%r3,3 # Check if we start on an address evenly divisible by 4.\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
lhbrx %r7,%r6,%r3 # Fix alignment if needed.\n\
sthx %r7,%r6,%r4\n\
addi %r6,%r6,2\n\
subi %r5,%r5,1\n\
.preploop:\n\
andi. %r7,%r5,1 # Check if even or odd number of 16-bit words.\n\
srawi %r5,%r5,1 # Number of 32-bit words to half-swap.\n\
mtctr %r5\n\
bc 12,gt,.oddloop # Jump to loop for odd number of 16-bit words.\n\
.loop: # Loop is 'unrolled' by reading/writing 32-bit words.\n\
lwbrx %r7,%r6,%r3\n\
rotlwi %r7,%r7,16\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
.oddloop:\n\
lwbrx %r7,%r6,%r3\n\
rotlwi %r7,%r7,16\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.oddloop\n\
sub %r6,%r6,2\n\
lhbrx %r7,%r6,%r3 # Fix last 16-bit word.\n\
sthx %r7,%r6,%r4\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: swap the bytes of each complete 16-bit word; a
 * trailing odd byte is ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int, so buffers with more than
 * INT_MAX halfwords would overflow it - confirm this limit is acceptable. */
void *swab(void *from,void *to,ssize_t len)
{
int i;
uint16_t u16,*u16in=from,*u16out=to;
for(i=0;i<(len>>1);i++) {
u16=u16in[i];
u16out[i]=u16>>8|u16<<8;
}
return(u16out);
}
#endif

View File

@@ -0,0 +1,91 @@
/* swab24() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 24-bit (3-byte) word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/remaining, r6/r7=index & r7=temp, r8/r9/r10=read/write temp */
/* PPC version: converts len to a 24-bit word count (len/3), processes four
 * words (12 bytes) per main-loop iteration, then fixes up the remaining
 * 0..3 words one byte at a time in ".fixloop". */
asm("\
.text\n\
.align 2\n\
.globl swab24\n\
.type swab24,@function\n\
swab24:\n\
dcbt 0,%r3\n\
li %r7,3\n\
divwu %r5,%r5,%r7\n\
andi. %r7,%r5,3\n\
srawi. %r5,%r5,2\n\
mtctr %r5\n\
or %r5,%r7,%r7\n\
li %r6,0\n\
bc 4,gt,.postfix\n\
.loop:\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwzx %r9,%r7,%r3\n\
addi %r7,%r6,8\n\
lwbrx %r10,%r7,%r3\n\
rotlwi %r8,%r8,8\n\
or %r7,%r9,%r9\n\
rlwimi %r9,%r8,16,8,15\n\
rlwimi %r9,%r10,8,16,23\n\
rlwimi %r8,%r7,16,24,31\n\
rotrwi %r10,%r10,8\n\
rlwimi %r10,%r7,16,0,7\n\
stwx %r8,%r6,%r4\n\
addi %r6,%r6,4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,4\n\
stwx %r10,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.postfix: # Fix any remaining 24-bit words (number of remaining words in r5).\n\
or. %r5,%r5,%r5\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
add %r3,%r3,%r6\n\
add %r6,%r4,%r6\n\
subi %r3,%r3,1\n\
.fixloop:\n\
lbzu %r7,1(%r3)\n\
lbzu %r8,1(%r3)\n\
lbzu %r9,1(%r3)\n\
stb %r7,2(%r6)\n\
stb %r8,1(%r6)\n\
stb %r9,0(%r6)\n\
addi %r6,%r6,3\n\
bc 0,lt,.fixloop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/*
 * Ugh, this is really very, very inefficient.
 * (But simple, understandable and safe)
 */
/* Portable C fallback: reverse the bytes of each complete 3-byte word.
 * Like the assembler version, which computes len/3 up front, any trailing
 * 1 or 2 bytes are left untouched. */
void *swab24(void *from,void *to,ssize_t len)
{
uint8_t *src=from,B0,B1,B2,*dst=to;
int i;
/* i+2<len guarantees a full 3-byte word is available; the previous
 * condition (i<len) read up to two bytes past the end of the buffer
 * whenever len was not a multiple of 3. */
for(i=0;i+2<len;i+=3) {
B0=src[i];
B1=src[i+1];
B2=src[i+2];
dst[i]=B2;
dst[i+1]=B1;
dst[i+2]=B0;
}
return(to);
}
#endif

View File

@@ -0,0 +1,112 @@
/* swab32() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 32-bit word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len, r6=index, r7=load/store temp */
/* PPC version: lwbrx performs the full byte reversal during the load, so
 * the loop body is a single load/store pair per 32-bit word. */
asm("\
.text\n\
.align 2\n\
.globl swab32\n\
.type swab32,@function\n\
swab32:\n\
srawi. %r5,%r5,2\n\
li %r6,0\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop:\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
/* The alternative implementation below is intentionally disabled; see the
 * author's note inside the comment for the size/speed trade-off. */
/* r3=from, r4=to, r5=len/temp, r6=index, r7=load/store temp, r8=cache hint
 *
 * The unrolled, cache-hinting version appears to be about 4.5% faster, but
 * in this case I opted for the smaller implementation. swab64() appears to
 * gain more from cache-hinting - probably because of it using more registers
 * for intermediate storage.
asm("\
.text\n\
.align 2\n\
.globl swab32\n\
.type swab32,@function\n\
swab32:\n\
dcbt 0,%r3\n\
andi. %r8,%r5,31 # The number of bytes handled in '.pre'. Used for prefetch hint.\n\
srawi %r5,%r5,2 # Convert bytes-># of 32-bit words\n\
andi. %r7,%r5,7\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
mtctr %r7\n\
.pre: # One 32-bit word at a time until we have (nLeft%8)==0 \n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
bc 0,lt,.pre\n\
.preploop:\n\
srawi. %r5,%r5,3 # Divide by 8 again to get number of loops.\n\
addi %r8,%r8,32 # Start address for next loop (from r3).\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop: # Loop unrolled 8 times = 32 bytes = 1 cache-line (except on the 970).\n\
dcbt %r8,%r3 # Cache hint (prefetch) for the next loop\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
lwbrx %r7,%r6,%r3\n\
stwx %r7,%r6,%r4\n\
addi %r6,%r6,4\n\
addi %r8,%r8,32 # Update cache-hint offset\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
*/
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: byte-reverse each complete 32-bit word; any trailing
 * 1..3 bytes are ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int; buffers with more than INT_MAX
 * words would overflow it. */
void *swab32(void *from,void *to,ssize_t len)
{
int i;
uint32_t *u32in=from,*u32out=to,tmp;
for(i=0;i<(len>>2);i++) {
tmp=u32in[i];
u32out[i]=((tmp&0xff)<<24)|
((tmp&0xff00)<<8)|
((tmp&0xff0000)>>8)|
((tmp&0xff000000)>>24);
}
return(to);
}
#endif

View File

@@ -0,0 +1,101 @@
/* swab64() - copy "len" bytes from "from" to "to" while reversing the byte
 * order of each 64-bit word. Returns the "to" pointer. */
#if defined(__GNUC__) && defined(__PPC__)
/* r3=from, r4=to, r5=len/temp, r6/r7=index, r8/r9=load/store temp, r10=cache hint */
/* This version is unrolled and uses cache-hinting. It appears to gain about 10%
 * over a non-unrolled, non-hinting version.
 */
/* Each 64-bit word is handled as two lwbrx byte-reversed loads whose results
 * are stored with their offsets exchanged, producing a full 8-byte reversal.
 * ".pre" consumes 64-bit words one at a time until the remaining count is a
 * multiple of 4; ".loop" then processes 4 words (32 bytes) per iteration. */
asm("\
.text\n\
.align 2\n\
.globl swab64\n\
.type swab64,@function\n\
swab64:\n\
dcbt 0,%r3\n\
andi. %r10,%r5,31 # The number of bytes handled in '.pre'. Used for prefetch hint.\n\
srawi %r5,%r5,3 # Convert bytes-># of 64-bit words\n\
andi. %r7,%r5,3\n\
li %r6,0\n\
bc 4,gt,.preploop\n\
mtctr %r7\n\
.pre: # One 64-bit word at a time until we have (nLeft%4)==0 \n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
bc 0,lt,.pre\n\
.preploop:\n\
srawi. %r5,%r5,2 # Divide by 4 again to get number of loops.\n\
addi %r10,%r10,32 # Start address for next loop.\n\
bc 4,gt,.exit\n\
mtctr %r5\n\
.loop: # Loop unrolled 4 times = 32 bytes = 1 cache-line (except on the 970).\n\
dcbt %r10,%r3 # Cache hint (prefetch) for the next iteration\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
lwbrx %r8,%r6,%r3\n\
addi %r7,%r6,4\n\
lwbrx %r9,%r7,%r3\n\
stwx %r8,%r7,%r4\n\
stwx %r9,%r6,%r4\n\
addi %r6,%r6,8\n\
addi %r10,%r10,32 # Update cache-hint offset\n\
bc 0,lt,.loop\n\
.exit:\n\
or %r3,%r4,%r4\n\
blr\n\
");
#else
#include <sys/types.h>
#include <stdint.h>
/* Portable C fallback: treat each 64-bit word as two 32-bit halves,
 * byte-swap each half and store the halves in exchanged order (positional,
 * so correct regardless of host endianness). Trailing bytes that do not
 * form a complete 8-byte word are ignored. Returns the "to" pointer.
 * NOTE(review): the loop counter is an int; buffers with more than INT_MAX
 * words would overflow it. */
void *swab64(void *from,void *to,ssize_t len)
{
int i;
struct {
uint32_t u32[2];
} *u64in=from,*u64out=to;
uint32_t tmp1,tmp2;
for(i=0;i<(len>>3);i++) {
tmp1=u64in[i].u32[0];
tmp2=u64in[i].u32[1];
u64out[i].u32[0]=((tmp2&0xff)<<24)|
((tmp2&0xff00)<<8)|
((tmp2&0xff0000)>>8)|
((tmp2&0xff000000)>>24);
u64out[i].u32[1]=((tmp1&0xff)<<24)|
((tmp1&0xff00)<<8)|
((tmp1&0xff0000)>>8)|
((tmp1&0xff000000)>>24);
}
return(to);
}
#endif
/* vi:set ts=3: */