Commit 086e9dc0 authored by James Hogan

metag: Optimised library functions

Add optimised library functions for metag.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
parent f507758c
#ifndef _METAG_CHECKSUM_H
#define _METAG_CHECKSUM_H
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
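/*
 * Illustrative usage (editorial annotation, not part of the interface):
 * partial sums can be chained across fragments and folded at the end, e.g.
 *
 *	__wsum sum = csum_partial(frag1, len1, 0);
 *	sum = csum_partial(frag2, len2, sum);
 *	__sum16 check = csum_fold(sum);
 */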
/*
* the same as csum_partial, but copies from src while it
* checksums
*
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
extern __wsum csum_partial_copy(const void *src, void *dst, int len,
__wsum sum);
/*
* the same as csum_partial_copy, but copies from user space.
*
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum sum, int *csum_err);
#define csum_partial_copy_nocheck(src, dst, len, sum) \
csum_partial_copy((src), (dst), (len), (sum))
/*
* Fold a partial checksum
*/
static inline __sum16 csum_fold(__wsum csum)
{
u32 sum = (__force u32)csum;
sum = (sum & 0xffff) + (sum >> 16);
sum = (sum & 0xffff) + (sum >> 16);
return (__force __sum16)~sum;
}
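/*
 * Annotation (editorial): adding the two 16-bit halves twice absorbs any
 * carry out of the first addition, and the complement gives the final
 * ones-complement checksum. Worked example, csum_fold(0x0001ffff):
 *	0xffff + 0x0001 = 0x10000
 *	0x0000 + 0x0001 = 0x0001
 *	~0x0001 -> 0xfffe (truncated to 16 bits)
 */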
/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which are always checksummed on 4-octet boundaries.
*/
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
/*
* computes the checksum of the TCP/UDP pseudo-header
* returns a 16-bit checksum, already complemented
*/
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
unsigned short len,
unsigned short proto,
__wsum sum)
{
unsigned long len_proto = (proto + len) << 8;
asm ("ADD %0, %0, %1\n"
"ADDS %0, %0, %2\n"
"ADDCS %0, %0, #1\n"
"ADDS %0, %0, %3\n"
"ADDCS %0, %0, #1\n"
: "=d" (sum)
: "d" (daddr), "d" (saddr), "d" (len_proto),
"0" (sum)
: "cc");
return sum;
}
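/*
 * Annotation (editorial): the inline asm above folds the TCP/UDP
 * pseudo-header into the running checksum, roughly:
 *
 *	sum += daddr;
 *	sum += saddr;     if the addition wrapped, sum += 1;
 *	sum += len_proto; if the addition wrapped, sum += 1;
 *
 * leaving a 32-bit partial sum ready for csum_fold().
 */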
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
unsigned short proto, __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
extern __sum16 ip_compute_csum(const void *buff, int len);
#endif /* _METAG_CHECKSUM_H */
#ifndef __ASM_DIV64_H__
#define __ASM_DIV64_H__
#include <asm-generic/div64.h>
extern u64 div_u64(u64 dividend, u64 divisor);
extern s64 div_s64(s64 dividend, s64 divisor);
#define div_u64 div_u64
#define div_s64 div_s64
#endif
#ifndef _METAG_STRING_H_
#define _METAG_STRING_H_
#define __HAVE_ARCH_MEMSET
extern void *memset(void *__s, int __c, size_t __count);
#define __HAVE_ARCH_MEMCPY
void *memcpy(void *__to, __const__ void *__from, size_t __n);
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
#endif /* _METAG_STRING_H_ */
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift left routine.
!
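! Annotation (added for clarity): with the operand split into HI:LO,
! a shift by COUNT < 32 gives
!     HI = (HI << COUNT) | (LO >> (32 - COUNT)),  LO = LO << COUNT
! and a shift by COUNT >= 32 gives
!     HI = LO << (COUNT - 32),  LO = 0
! which are the two paths below.
!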
.text
.global ___ashldi3
.type ___ashldi3,function
___ashldi3:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
CMP D1Ar3,#0 ! COUNT == 0
MOVEQ PC,D1RtP ! Yes, return
SUBS D0Ar4,D1Ar3,#32 ! N = COUNT - 32
BGE $L10
!! Shift < 32
NEG D0Ar4,D0Ar4 ! N = - N
LSL D1Re0,D1Re0,D1Ar3 ! HI = HI << COUNT
LSR D0Ar6,D0Re0,D0Ar4 ! TMP= LO >> -(COUNT - 32)
OR D1Re0,D1Re0,D0Ar6 ! HI = HI | TMP
SWAP D0Ar4,D1Ar3
LSL D0Re0,D0Re0,D0Ar4 ! LO = LO << COUNT
MOV PC,D1RtP
$L10:
!! Shift >= 32
LSL D1Re0,D0Re0,D0Ar4 ! HI = LO << N
MOV D0Re0,#0 ! LO = 0
MOV PC,D1RtP
.size ___ashldi3,.-___ashldi3
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift right routine.
!
.text
.global ___ashrdi3
.type ___ashrdi3,function
___ashrdi3:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
CMP D1Ar3,#0 ! COUNT == 0
MOVEQ PC,D1RtP ! Yes, return
MOV D0Ar4,D1Ar3
SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32
BGE $L20
!! Shift < 32
NEG D1Ar3,D1Ar3 ! N = - N
LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT
LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32)
OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP
SWAP D1Ar3,D0Ar4
ASR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT
MOV PC,D1RtP
$L20:
!! Shift >= 32
ASR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N
ASR D1Re0,D1Re0,#31 ! HI = HI >> 31
MOV PC,D1RtP
.size ___ashrdi3,.-___ashrdi3
/*
*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Tom May, <ftom@netcom.com>
* Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
* Lots of code moved from tcp.c and ip.c; see those files
* for more names.
*
* 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek:
* Fixed some nasty bugs, causing some horrible crashes.
* A: At some points, the sum (%0) was used as
* length-counter instead of the length counter
* (%1). Thanks to Roman Hodek for pointing this out.
* B: GCC seems to mess up if one uses too many
* data-registers to hold input values and one tries to
* specify d0 and d1 as scratch registers. Letting gcc
* choose these registers itself solves the problem.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
kills, so most of the assembly has to go. */
#include <linux/module.h>
#include <net/checksum.h>
#include <asm/byteorder.h>
static inline unsigned short from32to16(unsigned int x)
{
/* add the two 16-bit halves; the result is 16 bits plus a possible carry */
x = (x & 0xffff) + (x >> 16);
/* add up carry.. */
x = (x & 0xffff) + (x >> 16);
return x;
}
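/*
 * Annotation (editorial): do_csum() accumulates aligned 32-bit words and
 * detects unsigned overflow with "carry = (w > result)": if the addition
 * wrapped, the new total is necessarily smaller than the word just added,
 * so the lost carry is re-added on the next iteration and once more after
 * the loop. A buffer starting at an odd address is handled by summing the
 * first byte separately and byte-swapping the final 16-bit result.
 */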
static unsigned int do_csum(const unsigned char *buff, int len)
{
int odd;
unsigned int result = 0;
if (len <= 0)
goto out;
odd = 1 & (unsigned long) buff;
if (odd) {
#ifdef __LITTLE_ENDIAN
result += (*buff << 8);
#else
result = *buff;
#endif
len--;
buff++;
}
if (len >= 2) {
if (2 & (unsigned long) buff) {
result += *(unsigned short *) buff;
len -= 2;
buff += 2;
}
if (len >= 4) {
const unsigned char *end = buff + ((unsigned)len & ~3);
unsigned int carry = 0;
do {
unsigned int w = *(unsigned int *) buff;
buff += 4;
result += carry;
result += w;
carry = (w > result);
} while (buff < end);
result += carry;
result = (result & 0xffff) + (result >> 16);
}
if (len & 2) {
result += *(unsigned short *) buff;
buff += 2;
}
}
if (len & 1)
#ifdef __LITTLE_ENDIAN
result += *buff;
#else
result += (*buff << 8);
#endif
result = from32to16(result);
if (odd)
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
return result;
}
EXPORT_SYMBOL(ip_fast_csum);
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
unsigned int sum = (__force unsigned int)wsum;
unsigned int result = do_csum(buff, len);
/* add in old sum, and carry.. */
result += sum;
if (sum > result)
result += 1;
return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
return (__force __sum16)~do_csum(buff, len);
}
EXPORT_SYMBOL(ip_compute_csum);
/*
* copy from user space while checksumming, otherwise like csum_partial
*/
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst, int len,
__wsum sum, int *csum_err)
{
int missing;
missing = __copy_from_user(dst, src, len);
if (missing) {
memset(dst + len - missing, 0, missing);
*csum_err = -EFAULT;
} else
*csum_err = 0;
return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy_from_user);
/*
* copy from kernel space while checksumming, otherwise like csum_partial
*/
__wsum
csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
{
memcpy(dst, src, len);
return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy);
! Copyright 2007,2008,2009 Imagination Technologies Ltd.
#include <asm/page.h>
.text
.global _clear_page
.type _clear_page,function
!! D1Ar1 - page
_clear_page:
MOV TXRPT,#((PAGE_SIZE / 8) - 1)
MOV D0Re0,#0
MOV D1Re0,#0
$Lclear_page_loop:
SETL [D1Ar1++],D0Re0,D1Re0
BR $Lclear_page_loop
MOV PC,D1RtP
.size _clear_page,.-_clear_page
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit signed compare routine.
!
.text
.global ___cmpdi2
.type ___cmpdi2,function
! low high
! s64 a (D0Ar2, D1Ar1)
! s64 b (D0Ar4, D1Ar3)
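! Annotation (added for clarity): following the usual libgcc convention,
! the routine returns 0 if a < b, 1 if a == b, and 2 if a > b.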
___cmpdi2:
! start at 1 (equal) and conditionally increment or decrement
MOV D0Re0,#1
! high words differ?
CMP D1Ar1,D1Ar3
BNE $Lhigh_differ
! unsigned compare low words
CMP D0Ar2,D0Ar4
SUBLO D0Re0,D0Re0,#1
ADDHI D0Re0,D0Re0,#1
MOV PC,D1RtP
$Lhigh_differ:
! signed compare high words
SUBLT D0Re0,D0Re0,#1
ADDGT D0Re0,D0Re0,#1
MOV PC,D1RtP
.size ___cmpdi2,.-___cmpdi2
! Copyright 2007,2008 Imagination Technologies Ltd.
#include <asm/page.h>
.text
.global _copy_page
.type _copy_page,function
!! D1Ar1 - to
!! D0Ar2 - from
_copy_page:
MOV D0FrT,#PAGE_SIZE
$Lcopy_page_loop:
GETL D0Re0,D1Re0,[D0Ar2++]
GETL D0Ar6,D1Ar5,[D0Ar2++]
SETL [D1Ar1++],D0Re0,D1Re0
SETL [D1Ar1++],D0Ar6,D1Ar5
SUBS D0FrT,D0FrT,#16
BNZ $Lcopy_page_loop
MOV PC,D1RtP
.size _copy_page,.-_copy_page
/*
* Precise Delay Loops for Meta
*
* Copyright (C) 1993 Linus Torvalds
* Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
* Copyright (C) 2007,2009 Imagination Technologies Ltd.
*
*/
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <asm/core_reg.h>
#include <asm/processor.h>
/*
* TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap
* many times per-second. If it does wrap __delay will return prematurely,
* but this is only likely with large delay values.
*
* We also can't implement read_current_timer() with TXTACTCYC due to
* this wrapping behaviour.
*/
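/*
 * Annotation (illustrative arithmetic): a 24-bit cycle counter wraps every
 * 2^24 cycles, i.e. every 2^24 / f seconds at a core clock of f Hz
 * (e.g. about 33 ms at 500 MHz), hence the caveat above about large
 * delay values.
 */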
#define rdtimer(t) t = __core_reg_get(TXTACTCYC)
void __delay(unsigned long loops)
{
unsigned long bclock, now;
rdtimer(bclock);
do {
asm("NOP");
rdtimer(now);
} while ((now-bclock) < loops);
}
EXPORT_SYMBOL(__delay);
inline void __const_udelay(unsigned long xloops)
{
u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ;
__delay(loops >> 32);
}
EXPORT_SYMBOL(__const_udelay);
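/*
 * Annotation (editorial): __const_udelay() works in 32.32 fixed point.
 * xloops is the requested delay in seconds scaled by 2^32, so multiplying
 * by loops_per_jiffy * HZ (delay loops per second) and shifting right by
 * 32 yields the loop count. The scale factors used below are
 * ceil(2^32 / 10^6) = 0x10c7 for microseconds and ceil(2^32 / 10^9) = 5
 * for nanoseconds.
 */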
void __udelay(unsigned long usecs)
{
__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
! Copyright (C) 2012 Imagination Technologies Ltd.
!
! Signed/unsigned 64-bit division routines.
!
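! Annotation (added for clarity, inferred from the code below): the 64-bit
! dividend is passed in D1Ar1:D0Ar2, the divisor in D1Ar3:D0Ar4, and the
! quotient is returned in D1Re0:D0Re0. _div_u64 is plain binary long
! division: the divisor and a quotient bit are doubled until the divisor
! is no smaller than the dividend (or would overflow), then a
! subtract-and-shift-right loop builds the quotient one bit at a time.
! _div_s64 negates negative operands, calls _div_u64 and fixes up the
! sign of the quotient.
!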
.text
.global _div_u64
.type _div_u64,function
_div_u64:
$L1:
ORS A0.3,D1Ar3,D0Ar4
BNE $L3
$L2:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
MOV PC,D1RtP
$L3:
CMP D1Ar3,D1Ar1
CMPEQ D0Ar4,D0Ar2
MOV D0Re0,#1
MOV D1Re0,#0
BHS $L6
$L4:
ADDS D0Ar6,D0Ar4,D0Ar4
ADD D1Ar5,D1Ar3,D1Ar3
ADDCS D1Ar5,D1Ar5,#1
CMP D1Ar5,D1Ar3
CMPEQ D0Ar6,D0Ar4
BLO $L6
$L5:
MOV D0Ar4,D0Ar6
MOV D1Ar3,D1Ar5
ADDS D0Re0,D0Re0,D0Re0
ADD D1Re0,D1Re0,D1Re0
ADDCS D1Re0,D1Re0,#1
CMP D1Ar3,D1Ar1
CMPEQ D0Ar4,D0Ar2
BLO $L4
$L6:
ORS A0.3,D1Re0,D0Re0
MOV D0Ar6,#0
MOV D1Ar5,D0Ar6
BEQ $L10
$L7:
CMP D1Ar1,D1Ar3
CMPEQ D0Ar2,D0Ar4
BLO $L9
$L8:
ADDS D0Ar6,D0Ar6,D0Re0
ADD D1Ar5,D1Ar5,D1Re0
ADDCS D1Ar5,D1Ar5,#1
SUBS D0Ar2,D0Ar2,D0Ar4
SUB D1Ar1,D1Ar1,D1Ar3
SUBCS D1Ar1,D1Ar1,#1
$L9:
LSL A0.3,D1Re0,#31
LSR D0Re0,D0Re0,#1
LSR D1Re0,D1Re0,#1
OR D0Re0,D0Re0,A0.3
LSL A0.3,D1Ar3,#31
LSR D0Ar4,D0Ar4,#1
LSR D1Ar3,D1Ar3,#1
OR D0Ar4,D0Ar4,A0.3
ORS A0.3,D1Re0,D0Re0
BNE $L7
$L10:
MOV D0Re0,D0Ar6
MOV D1Re0,D1Ar5
MOV PC,D1RtP
.size _div_u64,.-_div_u64
.text
.global _div_s64
.type _div_s64,function
_div_s64:
MSETL [A0StP],D0FrT,D0.5
XOR D0.5,D0Ar2,D0Ar4
XOR D1.5,D1Ar1,D1Ar3
TSTT D1Ar1,#HI(0x80000000)
BZ $L25
NEGS D0Ar2,D0Ar2
NEG D1Ar1,D1Ar1
SUBCS D1Ar1,D1Ar1,#1
$L25:
TSTT D1Ar3,#HI(0x80000000)
BZ $L27
NEGS D0Ar4,D0Ar4
NEG D1Ar3,D1Ar3
SUBCS D1Ar3,D1Ar3,#1
$L27:
CALLR D1RtP,_div_u64
TSTT D1.5,#HI(0x80000000)
BZ $L29
NEGS D0Re0,D0Re0
NEG D1Re0,D1Re0
SUBCS D1Re0,D1Re0,#1
$L29:
GETL D0FrT,D1RtP,[A0StP+#(-16)]
GETL D0.5,D1.5,[A0StP+#(-8)]
SUB A0StP,A0StP,#16
MOV PC,D1RtP
.size _div_s64,.-_div_s64
! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
! Imagination Technologies Ltd
!
! Integer divide routines.
!
.text
.global ___udivsi3
.type ___udivsi3,function
.align 2
___udivsi3:
!!
!! Since core is signed divide case, just set control variable
!!
MOV D1Re0,D0Ar2 ! Au already in A1Ar1, Bu -> D1Re0
MOV D0Re0,#0 ! Result is 0
MOV D0Ar4,#0 ! Return positive result
B $LIDMCUStart
.size ___udivsi3,.-___udivsi3
!!
!! 32-bit division signed i/p - passed signed 32-bit numbers
!!
.global ___divsi3
.type ___divsi3,function
.align 2
___divsi3:
!!
!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B)
!!
MOV D1Re0,D0Ar2 ! A already in A1Ar1, B -> D1Re0
MOV D0Re0,#0 ! Result is 0
XOR D0Ar4,D1Ar1,D1Re0 ! D0Ar4 -ive if result is -ive
ABS D1Ar1,D1Ar1 ! abs(A) -> Au
ABS D1Re0,D1Re0 ! abs(B) -> Bu
$LIDMCUStart:
CMP D1Ar1,D1Re0 ! Is ( Au > Bu )?
LSR D1Ar3,D1Ar1,#2 ! Calculate (Au & (~3)) >> 2
CMPHI D1Re0,D1Ar3 ! OR ( (Au & (~3)) <= (Bu << 2) )?
LSLSHI D1Ar3,D1Re0,#1 ! Buq = Bu << 1
BLS $LIDMCUSetup ! Yes: Do normal divide
!!
!! Quick divide setup can assume that CurBit only needs to start at 2
!!
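!! Annotation (added for clarity): on this path Bu < Au < 4*Bu, so the
!! quotient is 1, 2 or 3 and can be formed with at most one conditional
!! subtraction of 2*Bu followed by one of Bu.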
$LIDMCQuick:
CMP D1Ar1,D1Ar3 ! ( A >= Buq )?
ADDCC D0Re0,D0Re0,#2 ! If yes result += 2
SUBCC D1Ar1,D1Ar1,D1Ar3 ! and A -= Buq
CMP D1Ar1,D1Re0 ! ( A >= Bu )?
ADDCC D0Re0,D0Re0,#1 ! If yes result += 1
SUBCC D1Ar1,D1Ar1,D1Re0 ! and A -= Bu
ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result?
NEG D0Ar2,D0Re0 ! Calculate neg result
MOVMI D0Re0,D0Ar2 ! Yes: Take neg result
$LIDMCRet:
MOV PC,D1RtP
!!
!! Setup for general unsigned divide code
!!
!! D0Re0 is used to form the result, already set to Zero
!! D1Re0 is the input Bu value, this gets trashed
!! D0Ar6 is curbit which is set to 1 at the start and shifted up
!! D0Ar4 is negative if we should return a negative result
!! D1Ar1 is the input Au value, eventually this holds the remainder
!!
$LIDMCUSetup:
CMP D1Ar1,D1Re0 ! Is ( Au < Bu )?
MOV D0Ar6,#1 ! Set curbit to 1
BCS $LIDMCRet ! Yes: Return 0 remainder Au
!!
!! Calculate alignment using FFB instruction
!!
FFB D1Ar5,D1Ar1 ! Find first bit of Au
ANDN D1Ar5,D1Ar5,#31 ! Handle exceptional case.
ORN D1Ar5,D1Ar5,#31 ! if N bit set, set to 31
FFB D1Ar3,D1Re0 ! Find first bit of Bu
ANDN D1Ar3,D1Ar3,#31 ! Handle exceptional case.
ORN D1Ar3,D1Ar3,#31 ! if N bit set, set to 31
SUBS D1Ar3,D1Ar5,D1Ar3 ! calculate diff, ffbA - ffbB
MOV D0Ar2,D1Ar3 ! copy into bank 0
LSLGT D1Re0,D1Re0,D1Ar3 ! ( > 0) ? left shift B
LSLGT D0Ar6,D0Ar6,D0Ar2 ! ( > 0) ? left shift curbit
!!
!! Now we start the divide proper, logic is
!!
!! if ( A >= B ) add curbit to result and subtract B from A
!! shift curbit and B down by 1 in either case
!!
$LIDMCLoop:
CMP D1Ar1, D1Re0 ! ( A >= B )?
ADDCC D0Re0, D0Re0, D0Ar6 ! If yes result += curbit
SUBCC D1Ar1, D1Ar1, D1Re0 ! and A -= B
LSRS D0Ar6, D0Ar6, #1 ! Shift down curbit, is it zero?
LSR D1Re0, D1Re0, #1 ! Shift down B
BNZ $LIDMCLoop ! Was single bit in curbit lost?
ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result?
NEG D0Ar2,D0Re0 ! Calculate neg result
MOVMI D0Re0,D0Ar2 ! Yes: Take neg result
MOV PC,D1RtP
.size ___divsi3,.-___divsi3
.text
/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries.
*
* extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
*
*/
.global _ip_fast_csum
.type _ip_fast_csum,function
_ip_fast_csum:
!! TXRPT needs loops - 1
SUBS TXRPT,D0Ar2,#1
MOV D0Re0,#0
BLO $Lfast_csum_exit
$Lfast_csum_loop:
GETD D1Ar3,[D1Ar1++]
ADDS D0Re0,D0Re0,D1Ar3
ADDCS D0Re0,D0Re0,#1
BR $Lfast_csum_loop
LSR D0Ar4,D0Re0,#16
AND D0Re0,D0Re0,#0xffff
AND D0Ar4,D0Ar4,#0xffff
ADD D0Re0,D0Re0,D0Ar4
LSR D0Ar4,D0Re0,#16
ADD D0Re0,D0Re0,D0Ar4
XOR D0Re0,D0Re0,#-1
AND D0Re0,D0Re0,#0xffff
$Lfast_csum_exit:
MOV PC,D1RtP
.size _ip_fast_csum,.-_ip_fast_csum
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit logical shift right routine.
!
.text
.global ___lshrdi3
.type ___lshrdi3,function
___lshrdi3:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
CMP D1Ar3,#0 ! COUNT == 0
MOVEQ PC,D1RtP ! Yes, return
MOV D0Ar4,D1Ar3
SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32
BGE $L30
!! Shift < 32
NEG D1Ar3,D1Ar3 ! N = - N
LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT
LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32)
OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP
SWAP D1Ar3,D0Ar4
LSR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT
MOV PC,D1RtP
$L30:
!! Shift >= 32
LSR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N
MOV D1Re0,#0 ! HI = 0
MOV PC,D1RtP
.size ___lshrdi3,.-___lshrdi3
! Copyright (C) 2008-2012 Imagination Technologies Ltd.
.text
.global _memcpy
.type _memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memcpy:
CMP D1Ar3, #16
MOV A1.2, D0Ar2 ! source pointer
MOV A0.2, D1Ar1 ! destination pointer
MOV A0.3, D1Ar1 ! for return value
! If there are less than 16 bytes to copy use the byte copy loop
BGE $Llong_copy
$Lbyte_copy:
! Simply copy a byte at a time
SUBS TXRPT, D1Ar3, #1
BLT $Lend
$Lloop_byte:
GETB D1Re0, [A1.2++]
SETB [A0.2++], D1Re0
BR $Lloop_byte
$Lend:
! Finally set return value and return
MOV D0Re0, A0.3
MOV PC, D1RtP
$Llong_copy:
ANDS D1Ar5, D1Ar1, #7 ! test destination alignment
BZ $Laligned_dst
! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e we should never copy more than 7
! bytes here).
$Lalign_dst:
GETB D0Re0, [A1.2++]
ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8
SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes
SETB [A0.2++], D0Re0
CMP D1Ar5, #8
BNE $Lalign_dst
! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
$Laligned_dst:
MOV D0Ar4, A1.2
LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks
ANDS D0Ar4, D0Ar4, #7 ! test source alignment
BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop
! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks
BZ $Lbyte_copy
SUB TXRPT, D1Ar5, #1
$Laligned_32:
GETL D0Re0, D1Re0, [A1.2++]
GETL D0Ar6, D1Ar5, [A1.2++]
SETL [A0.2++], D0Re0, D1Re0
SETL [A0.2++], D0Ar6, D1Ar5
GETL D0Re0, D1Re0, [A1.2++]
GETL D0Ar6, D1Ar5, [A1.2++]
SETL [A0.2++], D0Re0, D1Re0
SETL [A0.2++], D0Ar6, D1Ar5
BR $Laligned_32
! If there are any remaining bytes use the byte copy loop, otherwise we are done
ANDS D1Ar3, D1Ar3, #0x1f
BNZ $Lbyte_copy
B $Lend
! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
MOV D0Ar4, A1.2
MOV D0Ar6, A1.2
ANDMB D0Ar4, D0Ar4, #0xfff8
MOV A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
SUBS D0Ar6, D0Ar6, D0Ar4
MOV D0Ar4, D0Ar6
! if there is no mis-alignment after all, use the aligned copy loop
BZ $Laligned_copy
! prefetch 8 bytes
GETL D0Re0, D1Re0, [A1.2]
SUB TXRPT, D1Ar5, #1
! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
! 4 bytes, and more than 4 bytes.
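! Annotation (added for clarity): in the shifting loops below each aligned
! 64-bit store is assembled from consecutive aligned 64-bit loads, every
! output 32-bit word being formed from two adjacent source words as
!     out = (lower >> bit_offset) | (upper << (32 - bit_offset))
! where bit_offset is 8 * (source mis-alignment modulo 4). The exactly
! 4 byte case needs no shifting, only a shuffle of 32-bit words.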
CMP D0Ar6, #4
BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop
BZ $Lunaligned_4 ! use 4 byte mis-alignment loop
! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
SUB D0Ar6, D0Ar6, #4
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
MULW D0Ar6, D0Ar6, #8
MOV D1Ar5, #32
SUB D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop
MOV D0Re0, D1Re0
$Lloop_5_6_7:
GETL D0Ar2, D1Ar1, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
LSR D0Re0, D0Re0, D0Ar6
MOV D1Re0, D0Ar2
LSL D1Re0, D1Re0, D1Ar5
ADD D0Re0, D0Re0, D1Re0
LSR D0Ar2, D0Ar2, D0Ar6
LSL D1Re0, D1Ar1, D1Ar5
ADD D1Re0, D1Re0, D0Ar2
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D1Ar1
BR $Lloop_5_6_7
B $Lunaligned_end
$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
MULW D0Ar6, D0Ar6, #8
MOV D1Ar5, #32
SUB D1Ar5, D1Ar5, D0Ar6
$Lloop_1_2_3:
! form 64-bit data in D0Re0,D1Re0
LSR D0Re0, D0Re0, D0Ar6
LSL D1Ar1, D1Re0, D1Ar5
ADD D0Re0, D0Re0, D1Ar1
MOV D0Ar2, D1Re0
LSR D0FrT, D0Ar2, D0Ar6
GETL D0Ar2, D1Ar1, [++A1.2]
MOV D1Re0, D0Ar2
LSL D1Re0, D1Re0, D1Ar5
ADD D1Re0, D1Re0, D0FrT
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D0Ar2
MOV D1Re0, D1Ar1
BR $Lloop_1_2_3
B $Lunaligned_end
! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
$Lunaligned_4:
MOV D0Re0, D1Re0
$Lloop_4:
GETL D0Ar2, D1Ar1, [++A1.2]
MOV D1Re0, D0Ar2
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D1Ar1
BR $Lloop_4
$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
ANDS D1Ar3, D1Ar3, #7
BZ $Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
MOV D0Ar6, A1.2
ADD D1Ar5, D0Ar4, D0Ar6
MOV A1.2, D1Ar5
B $Lbyte_copy
.size _memcpy,.-_memcpy
! Copyright (C) 2008-2012 Imagination Technologies Ltd.
.text
.global _memmove
.type _memmove,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memmove:
CMP D1Ar3, #0
MOV D0Re0, D1Ar1
BZ $LEND2
MSETL [A0StP], D0.5, D0.6, D0.7
MOV D1Ar5, D0Ar2
CMP D1Ar1, D1Ar5
BLT $Lforwards_copy
SUB D0Ar4, D1Ar1, D1Ar3
ADD D0Ar4, D0Ar4, #1
CMP D0Ar2, D0Ar4
BLT $Lforwards_copy
! should copy backwards
MOV D1Re0, D0Ar2
! adjust pointer to the end of mem
ADD D0Ar2, D1Re0, D1Ar3
ADD D1Ar1, D1Ar1, D1Ar3
MOV A1.2, D0Ar2
MOV A0.2, D1Ar1
CMP D1Ar3, #8
BLT $Lbbyte_loop
MOV D0Ar4, D0Ar2
MOV D1Ar5, D1Ar1
! test 8 byte alignment
ANDS D1Ar5, D1Ar5, #7
BNE $Lbdest_unaligned
ANDS D0Ar4, D0Ar4, #7
BNE $Lbsrc_unaligned
LSR D1Ar5, D1Ar3, #3
$Lbaligned_loop:
GETL D0Re0, D1Re0, [--A1.2]
SETL [--A0.2], D0Re0, D1Re0
SUBS D1Ar5, D1Ar5, #1
BNE $Lbaligned_loop
ANDS D1Ar3, D1Ar3, #7
BZ $Lbbyte_loop_exit
$Lbbyte_loop:
GETB D1Re0, [--A1.2]
SETB [--A0.2], D1Re0
SUBS D1Ar3, D1Ar3, #1
BNE $Lbbyte_loop
$Lbbyte_loop_exit:
MOV D0Re0, A0.2
$LEND:
SUB A0.2, A0StP, #24
MGETL D0.5, D0.6, D0.7, [A0.2]
SUB A0StP, A0StP, #24
$LEND2:
MOV PC, D1RtP
$Lbdest_unaligned:
GETB D0Re0, [--A1.2]
SETB [--A0.2], D0Re0
SUBS D1Ar5, D1Ar5, #1
SUB D1Ar3, D1Ar3, #1
BNE $Lbdest_unaligned
CMP D1Ar3, #8
BLT $Lbbyte_loop
$Lbsrc_unaligned:
LSR D1Ar5, D1Ar3, #3
! adjust A1.2
MOV D0Ar4, A1.2
! save original address
MOV D0Ar6, A1.2
ADD D0Ar4, D0Ar4, #7
ANDMB D0Ar4, D0Ar4, #0xfff8
! new address is the 8-byte aligned one above the original
MOV A1.2, D0Ar4
! A0.2 dst 64-bit is aligned
! measure the gap size
SUB D0Ar6, D0Ar4, D0Ar6
MOVS D0Ar4, D0Ar6
! keep this information for the later adjustment
! both aligned
BZ $Lbaligned_loop
! prefetch
GETL D0Re0, D1Re0, [--A1.2]
CMP D0Ar6, #4
BLT $Lbunaligned_1_2_3
! 32-bit aligned
BZ $Lbaligned_4
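! Annotation (added for clarity): the backward unaligned loops below mirror
! the forward copy: each 64-bit store is built from consecutive aligned
! loads, combining adjacent 32-bit source words with a left shift by the
! gap size (in bits) and a right shift by its 32-bit complement; the
! exactly-4-byte gap needs only a word shuffle.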
SUB D0Ar6, D0Ar6, #4
! D1.6 stores the gap size in bits
MULW D1.6, D0Ar6, #8
MOV D0.6, #32
! D0.6 stores the complement of the gap size
SUB D0.6, D0.6, D1.6
$Lbunaligned_5_6_7:
GETL D0.7, D1.7, [--A1.2]
! form 64-bit data in D0Re0, D1Re0
MOV D1Re0, D0Re0
! D1Re0 << gap-size
LSL D1Re0, D1Re0, D1.6
MOV D0Re0, D1.7
! D0Re0 >> complement
LSR D0Re0, D0Re0, D0.6
MOV D1.5, D0Re0
! combine the both
ADD D1Re0, D1Re0, D1.5
MOV D1.5, D1.7
LSL D1.5, D1.5, D1.6
MOV D0Re0, D0.7
LSR D0Re0, D0Re0, D0.6
MOV D0.5, D1.5
ADD D0Re0, D0Re0, D0.5
SETL [--A0.2], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lbunaligned_5_6_7
ANDS D1Ar3, D1Ar3, #7
BZ $Lbbyte_loop_exit
! Adjust A1.2
! A1.2 <- A1.2 +8 - gapsize
ADD A1.2, A1.2, #8
SUB A1.2, A1.2, D0Ar4
B $Lbbyte_loop
$Lbunaligned_1_2_3:
MULW D1.6, D0Ar6, #8
MOV D0.6, #32
SUB D0.6, D0.6, D1.6
$Lbunaligned_1_2_3_loop:
GETL D0.7, D1.7, [--A1.2]
! form 64-bit data in D0Re0, D1Re0
LSL D1Re0, D1Re0, D1.6
! save D0Re0 for later use
MOV D0.5, D0Re0
LSR D0Re0, D0Re0, D0.6
MOV D1.5, D0Re0
ADD D1Re0, D1Re0, D1.5
! original data in D0Re0
MOV D1.5, D0.5
LSL D1.5, D1.5, D1.6
MOV D0Re0, D1.7
LSR D0Re0, D0Re0, D0.6
MOV D0.5, D1.5
ADD D0Re0, D0Re0, D0.5
SETL [--A0.2], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lbunaligned_1_2_3_loop
ANDS D1Ar3, D1Ar3, #7
BZ $Lbbyte_loop_exit
! Adjust A1.2
ADD A1.2, A1.2, #8
SUB A1.2, A1.2, D0Ar4
B $Lbbyte_loop
$Lbaligned_4:
GETL D0.7, D1.7, [--A1.2]
MOV D1Re0, D0Re0
MOV D0Re0, D1.7
SETL [--A0.2], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lbaligned_4
ANDS D1Ar3, D1Ar3, #7
BZ $Lbbyte_loop_exit
! Adjust A1.2
ADD A1.2, A1.2, #8
SUB A1.2, A1.2, D0Ar4
B $Lbbyte_loop
$Lforwards_copy:
MOV A1.2, D0Ar2
MOV A0.2, D1Ar1
CMP D1Ar3, #8
BLT $Lfbyte_loop
MOV D0Ar4, D0Ar2
MOV D1Ar5, D1Ar1
ANDS D1Ar5, D1Ar5, #7
BNE $Lfdest_unaligned
ANDS D0Ar4, D0Ar4, #7
BNE $Lfsrc_unaligned
LSR D1Ar5, D1Ar3, #3
$Lfaligned_loop:
GETL D0Re0, D1Re0, [A1.2++]
SUBS D1Ar5, D1Ar5, #1
SETL [A0.2++], D0Re0, D1Re0
BNE $Lfaligned_loop
ANDS D1Ar3, D1Ar3, #7
BZ $Lfbyte_loop_exit
$Lfbyte_loop:
GETB D1Re0, [A1.2++]
SETB [A0.2++], D1Re0
SUBS D1Ar3, D1Ar3, #1
BNE $Lfbyte_loop
$Lfbyte_loop_exit:
MOV D0Re0, D1Ar1
B $LEND
$Lfdest_unaligned:
GETB D0Re0, [A1.2++]
ADD D1Ar5, D1Ar5, #1
SUB D1Ar3, D1Ar3, #1
SETB [A0.2++], D0Re0
CMP D1Ar5, #8
BNE $Lfdest_unaligned
CMP D1Ar3, #8
BLT $Lfbyte_loop
$Lfsrc_unaligned:
! adjust A1.2
LSR D1Ar5, D1Ar3, #3
MOV D0Ar4, A1.2
MOV D0Ar6, A1.2
ANDMB D0Ar4, D0Ar4, #0xfff8
MOV A1.2, D0Ar4
! A0.2 dst 64-bit is aligned
SUB D0Ar6, D0Ar6, D0Ar4
! keep the information for the later adjustment
MOVS D0Ar4, D0Ar6
! both aligned
BZ $Lfaligned_loop
! prefetch
GETL D0Re0, D1Re0, [A1.2]
CMP D0Ar6, #4
BLT $Lfunaligned_1_2_3
BZ $Lfaligned_4
SUB D0Ar6, D0Ar6, #4
MULW D0.6, D0Ar6, #8
MOV D1.6, #32
SUB D1.6, D1.6, D0.6
$Lfunaligned_5_6_7:
GETL D0.7, D1.7, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
MOV D0Re0, D1Re0
LSR D0Re0, D0Re0, D0.6
MOV D1Re0, D0.7
LSL D1Re0, D1Re0, D1.6
MOV D0.5, D1Re0
ADD D0Re0, D0Re0, D0.5
MOV D0.5, D0.7
LSR D0.5, D0.5, D0.6
MOV D1Re0, D1.7
LSL D1Re0, D1Re0, D1.6
MOV D1.5, D0.5
ADD D1Re0, D1Re0, D1.5
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lfunaligned_5_6_7
ANDS D1Ar3, D1Ar3, #7
BZ $Lfbyte_loop_exit
! Adjust A1.2
ADD A1.2, A1.2, D0Ar4
B $Lfbyte_loop
$Lfunaligned_1_2_3:
MULW D0.6, D0Ar6, #8
MOV D1.6, #32
SUB D1.6, D1.6, D0.6
$Lfunaligned_1_2_3_loop:
GETL D0.7, D1.7, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
LSR D0Re0, D0Re0, D0.6
MOV D1.5, D1Re0
LSL D1Re0, D1Re0, D1.6
MOV D0.5, D1Re0
ADD D0Re0, D0Re0, D0.5
MOV D0.5, D1.5
LSR D0.5, D0.5, D0.6
MOV D1Re0, D0.7
LSL D1Re0, D1Re0, D1.6
MOV D1.5, D0.5
ADD D1Re0, D1Re0, D1.5
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lfunaligned_1_2_3_loop
ANDS D1Ar3, D1Ar3, #7
BZ $Lfbyte_loop_exit
! Adjust A1.2
ADD A1.2, A1.2, D0Ar4
B $Lfbyte_loop
$Lfaligned_4:
GETL D0.7, D1.7, [++A1.2]
MOV D0Re0, D1Re0
MOV D1Re0, D0.7
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D0.7
MOV D1Re0, D1.7
SUBS D1Ar5, D1Ar5, #1
BNE $Lfaligned_4
ANDS D1Ar3, D1Ar3, #7
BZ $Lfbyte_loop_exit
! Adjust A1.2
ADD A1.2, A1.2, D0Ar4
B $Lfbyte_loop
.size _memmove,.-_memmove
! Copyright (C) 2008-2012 Imagination Technologies Ltd.
.text
.global _memset
.type _memset,function
! D1Ar1 dst
! D0Ar2 c
! D1Ar3 cnt
! D0Re0 dst
_memset:
AND D0Ar2,D0Ar2,#0xFF ! Ensure a byte input value
MULW D0Ar2,D0Ar2,#0x0101 ! Duplicate byte value into 0-15
ANDS D0Ar4,D1Ar1,#7 ! Extract bottom LSBs of dst
LSL D0Re0,D0Ar2,#16 ! Duplicate byte value into 16-31
ADD A0.2,D0Ar2,D0Re0 ! Duplicate byte value into 4 bytes (A0.2)
MOV D0Re0,D1Ar1 ! Return dst
BZ $LLongStub ! if start address is aligned
! start address is not aligned on an 8 byte boundary, so we
! need the number of bytes up to the next 8 byte address
! boundary, or the length of the area if less than 8, in D1Ar5
MOV D0Ar2,#8 ! Need 8 - N in D1Ar5 ...
SUB D1Ar5,D0Ar2,D0Ar4 ! ... subtract N
CMP D1Ar3,D1Ar5
MOVMI D1Ar5,D1Ar3
B $LByteStub ! dst is mis-aligned, do $LByteStub
!
! Preamble to LongLoop which generates 4*8 bytes per iteration (5 cycles)
!
$LLongStub:
LSRS D0Ar2,D1Ar3,#5
AND D1Ar3,D1Ar3,#0x1F
MOV A1.2,A0.2
BEQ $LLongishStub
SUB TXRPT,D0Ar2,#1
CMP D1Ar3,#0
$LLongLoop:
SETL [D1Ar1++],A0.2,A1.2
SETL [D1Ar1++],A0.2,A1.2
SETL [D1Ar1++],A0.2,A1.2
SETL [D1Ar1++],A0.2,A1.2
BR $LLongLoop
BZ $Lexit
!
! Preamble to LongishLoop which generates 1*8 bytes per iteration (2 cycles)
!
$LLongishStub:
LSRS D0Ar2,D1Ar3,#3
AND D1Ar3,D1Ar3,#0x7
MOV D1Ar5,D1Ar3
BEQ $LByteStub
SUB TXRPT,D0Ar2,#1
CMP D1Ar3,#0
$LLongishLoop:
SETL [D1Ar1++],A0.2,A1.2
BR $LLongishLoop
BZ $Lexit
!
! This does a byte structured burst of up to 7 bytes
!
! D1Ar1 should point to the location required
! D1Ar3 should be the remaining total byte count
! D1Ar5 should be burst size (<= D1Ar3)
!
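! Annotation (added for clarity): the burst is performed by a computed
! branch into the table of SETB instructions below - the burst size is
! scaled by the 4-byte instruction size and rebased so that the jump skips
! the first (7 - burst) entries, executing exactly 'burst' byte stores.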
$LByteStub:
SUBS D1Ar3,D1Ar3,D1Ar5 ! Reduce count
ADD D1Ar1,D1Ar1,D1Ar5 ! Advance pointer to end of area
MULW D1Ar5,D1Ar5,#4 ! Scale to (1*4), (2*4), (3*4)
SUB D1Ar5,D1Ar5,#(8*4) ! Rebase to -(7*4), -(6*4), -(5*4), ...
MOV A1.2,D1Ar5
SUB PC,CPC1,A1.2 ! Jump into table below
SETB [D1Ar1+#(-7)],A0.2
SETB [D1Ar1+#(-6)],A0.2
SETB [D1Ar1+#(-5)],A0.2
SETB [D1Ar1+#(-4)],A0.2
SETB [D1Ar1+#(-3)],A0.2
SETB [D1Ar1+#(-2)],A0.2
SETB [D1Ar1+#(-1)],A0.2
!
! Return if all data has been output, otherwise do $LLongStub
!
BNZ $LLongStub
$Lexit:
MOV PC,D1RtP
.size _memset,.-_memset
! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
! Imagination Technologies Ltd
!
! Integer modulus routines.
!
!!
!! 32-bit modulus unsigned i/p - passed unsigned 32-bit numbers
!!
.text
.global ___umodsi3
.type ___umodsi3,function
.align 2
___umodsi3:
MOV D0FrT,D1RtP ! Save original return address
CALLR D1RtP,___udivsi3
MOV D1RtP,D0FrT ! Recover return address
MOV D0Re0,D1Ar1 ! Return remainder
MOV PC,D1RtP
.size ___umodsi3,.-___umodsi3
!!
!! 32-bit modulus signed i/p - passed signed 32-bit numbers
!!
.global ___modsi3
.type ___modsi3,function
.align 2
___modsi3:
MOV D0FrT,D1RtP ! Save original return address
MOV A0.2,D1Ar1 ! Save A in A0.2
CALLR D1RtP,___divsi3
MOV D1RtP,D0FrT ! Recover return address
MOV D1Re0,A0.2 ! Recover A
MOV D0Re0,D1Ar1 ! Return remainder
ORS D1Re0,D1Re0,D1Re0 ! Was A negative?
NEG D1Ar1,D1Ar1 ! Negate remainder
MOVMI D0Re0,D1Ar1 ! Return neg remainder
MOV PC, D1RtP
.size ___modsi3,.-___modsi3
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit multiply routine.
!
!
! 64-bit signed/unsigned multiply
!
! A = D1Ar1:D0Ar2 = a 2^48 + b 2^32 + c 2^16 + d 2^0
!
! B = D1Ar3:D0Ar4 = w 2^48 + x 2^32 + y 2^16 + z 2^0
!
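! Annotation (added for clarity): only the low 64 bits of the product are
! needed, so with A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo,
!
!     A * B mod 2^64 = Alo*Blo + ((Ahi*Blo + Alo*Bhi) mod 2^32) * 2^32
!
! The cross terms come from the two MULD instructions below, and Alo*Blo is
! assembled from the four 16x16 partial products d*z, c*z, d*y and c*y,
! with carries propagated into the upper word.
!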
.text
.global ___muldi3
.type ___muldi3,function
___muldi3:
MULD D1Re0,D1Ar1,D0Ar4 ! (a 2^48 + b 2^32)(y 2^16 + z 2^0)
MULD D0Re0,D0Ar2,D1Ar3 ! (w 2^48 + x 2^32)(c 2^16 + d 2^0)
ADD D1Re0,D1Re0,D0Re0
MULW D0Re0,D0Ar2,D0Ar4 ! (d 2^0) * (z 2^0)
RTDW D0Ar2,D0Ar2
MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(z 2^0)
LSR D1Ar5,D0Ar6,#16
LSL D0Ar6,D0Ar6,#16
ADDS D0Re0,D0Re0,D0Ar6
ADDCS D1Re0,D1Re0,#1
RTDW D0Ar4,D0Ar4
ADD D1Re0,D1Re0,D1Ar5
MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(y 2^16)
ADD D1Re0,D1Re0,D0Ar6
RTDW D0Ar2,D0Ar2
MULW D0Ar6,D0Ar2,D0Ar4 ! (d 2^0)(y 2^16)
LSR D1Ar5,D0Ar6,#16
LSL D0Ar6,D0Ar6,#16
ADDS D0Re0,D0Re0,D0Ar6
ADD D1Re0,D1Re0,D1Ar5
ADDCS D1Re0,D1Re0,#1
MOV PC, D1RtP
.size ___muldi3,.-___muldi3
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit unsigned compare routine.
!
.text
.global ___ucmpdi2
.type ___ucmpdi2,function
! low high
! u64 a (D0Ar2, D1Ar1)
! u64 b (D0Ar4, D1Ar3)
___ucmpdi2:
! start at 1 (equal) and conditionally increment or decrement
MOV D0Re0,#1
! high words
CMP D1Ar1,D1Ar3
! or if equal, low words
CMPEQ D0Ar2,D0Ar4
! unsigned compare
SUBLO D0Re0,D0Re0,#1
ADDHI D0Re0,D0Re0,#1
MOV PC,D1RtP
.size ___ucmpdi2,.-___ucmpdi2