Commit ad889c6b authored by Russell King's avatar Russell King

Fix nwfpe so GDB can debug user space floating point again.

Patch 960/1 (Peter Teichmann):
   NWFPE patch to be more compliant to IEEE-754

1. The RND/URD instruction was handled as int_to_float(float_to_int
   (number)) which is wrong because it only works for floating point
   numbers that fit in an integer.

2. The FLT instruction was setting the rounding precision for
   extended precision calculations, which is not necessary
   (probably a historic relict) but has undesirable side effects
   on all extended precision calculations.
parent a6560a26
2002-01-19 Russell King <rmk@arm.linux.org.uk>
* fpa11.h - Add documentation
- remove userRegisters pointer from this structure.
- add new method to obtain integer register values.
* softfloat.c - Remove float128
* softfloat.h - Remove float128
* softfloat-specialize - Remove float128
* The FPA11 structure is not a kernel-specific data structure.
It is used by users of ptrace to examine the values of the
floating point registers. Therefore, any changes to the
FPA11 structure (size or position of elements contained
within) have to be well thought out.
* Since 128-bit float requires the FPA11 structure to change
size, it has been removed. 128-bit float is currently unused,
and needs various things to be re-worked so that we won't
overflow the available space in the task structure.
* The changes are designed to break any patch that goes on top
of this code, so that the authors properly review their changes.
1999-08-19 Scott Bambrough <scottb@netwinder.org>
* fpmodule.c - Changed version number to 0.95
......
......@@ -19,9 +19,9 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpa11.h"
float64 float64_exp(float64 Fm);
float64 float64_ln(float64 Fm);
......@@ -165,8 +165,7 @@ unsigned int DoubleCPDO(const unsigned int opcode)
case RND_CODE:
case URD_CODE:
fpa11->fpreg[Fd].fDouble =
int32_to_float64(float64_to_int32(rFm));
fpa11->fpreg[Fd].fDouble = float64_round_to_int(rFm);
break;
case SQT_CODE:
......
......@@ -52,8 +52,6 @@ This routine does three things:
1) The kernel has created a struct pt_regs on the stack and saved the
user registers into it. See /usr/include/asm/proc/ptrace.h for details.
The emulator code uses userRegisters as the base of an array of words
from which the contents of the registers can be extracted.
2) It calls EmulateAll to emulate a floating point instruction.
EmulateAll returns 1 if the emulation was successful, or 0 if not.
......@@ -72,14 +70,9 @@ several floating point instructions. */
.globl nwfpe_enter
nwfpe_enter:
/* ?? Could put userRegisters and fpa11 into fixed regs during
emulation. This would reduce load/store overhead at the expense
of stealing two regs from the register allocator. Not sure if
it's worth it. */
str sp, [r10] @ Store the user registers pointer in the fpa11 structure.
mov r4, lr @ save the failure-return addresses
mov sl, sp
mov r0, r10
bl FPA11_CheckInit @ check to see if we are initialised
ldr r5, [sp, #60] @ get contents of PC;
......
......@@ -65,18 +65,10 @@ several floating point instructions. */
.globl nwfpe_enter
nwfpe_enter:
ldr r4, =userRegisters
str sp, [r4] @ save pointer to user regs
mov sl, sp
bl FPA11_CheckInit @ check to see if we are initialised
mov r10, sp, lsr #13 @ find workspace
mov r10, r10, lsl #13
add r10, r10, #TSS_FPESAVE
ldr r4, =fpa11
str r10, [r4] @ store pointer to our state
mov r4, sp @ use r4 for local pointer
ldr r5, [r4, #60] @ get contents of PC
ldr r5, [sp, #60] @ get contents of PC
bic r5, r5, #0xfc000003
ldr r0, [r5, #-4] @ get actual instruction into r0
bl EmulateAll @ emulate the instruction
......@@ -93,10 +85,10 @@ next:
teqne r2, #0x0E000000
bne ret_from_exception @ return ok if not a fp insn
ldr r9, [r4, #60] @ get new condition codes
ldr r9, [sp, #60] @ get new condition codes
and r9, r9, #0xfc000003
orr r7, r5, r9
str r7, [r4, #60] @ update PC copy in regs
str r7, [sp, #60] @ update PC copy in regs
mov r0, r6 @ save a copy
mov r1, r9 @ fetch the condition codes
......
......@@ -19,9 +19,9 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpa11.h"
floatx80 floatx80_exp(floatx80 Fm);
floatx80 floatx80_ln(floatx80 Fm);
......@@ -157,8 +157,7 @@ unsigned int ExtendedCPDO(const unsigned int opcode)
case RND_CODE:
case URD_CODE:
fpa11->fpreg[Fd].fExtended =
int32_to_floatx80(floatx80_to_int32(rFm));
fpa11->fpreg[Fd].fExtended = floatx80_round_to_int(rFm);
break;
case SQT_CODE:
......
......@@ -18,8 +18,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/compiler.h>
#include <asm/system.h>
#include "fpa11.h"
#include "fpopcode.h"
......@@ -27,6 +25,9 @@
#include "fpmodule.h"
#include "fpmodule.inl"
#include <linux/compiler.h>
#include <asm/system.h>
/* forward declarations */
unsigned int EmulateCPDO(const unsigned int);
unsigned int EmulateCPDT(const unsigned int);
......@@ -56,6 +57,7 @@ void resetFPA11(void)
void SetRoundingMode(const unsigned int opcode)
{
#if MAINTAIN_FPCR
FPA11 *fpa11 = GET_FPA11();
fpa11->fpcr &= ~MASK_ROUNDING_MODE;
#endif
switch (opcode & MASK_ROUNDING_MODE)
......@@ -94,6 +96,7 @@ void SetRoundingMode(const unsigned int opcode)
void SetRoundingPrecision(const unsigned int opcode)
{
#if MAINTAIN_FPCR
FPA11 *fpa11 = GET_FPA11();
fpa11->fpcr &= ~MASK_ROUNDING_PRECISION;
#endif
switch (opcode & MASK_ROUNDING_PRECISION)
......@@ -123,8 +126,9 @@ void SetRoundingPrecision(const unsigned int opcode)
}
}
void FPA11_CheckInit(FPA11 *fpa11)
void FPA11_CheckInit(void)
{
FPA11 *fpa11 = GET_FPA11();
if (unlikely(fpa11->initflag == 0))
{
resetFPA11();
......
......@@ -22,45 +22,66 @@
#ifndef __FPA11_H__
#define __FPA11_H__
#define GET_FPA11() ((FPA11 *)(&current_thread_info()->fpstate))
/*
* The processes registers are always at the very top of the 8K
* stack+task struct. Use the same method as 'current' uses to
* reach them.
*/
register unsigned int *user_registers asm("sl");
#define GET_USERREG() (user_registers)
#include <linux/thread_info.h>
/* includes */
#include "fpsr.h" /* FP control and status register definitions */
#include "softfloat.h"
/* Need task_struct */
#include <linux/sched.h>
#define typeNone 0x00
#define typeSingle 0x01
#define typeDouble 0x02
#define typeExtended 0x03
/*
* This must be no more and no less than 12 bytes.
*/
typedef union tagFPREG {
float32 fSingle;
float64 fDouble;
floatx80 fExtended;
float64 fDouble;
float32 fSingle;
} FPREG;
/* FPA11 device model */
/*
* FPA11 device model.
*
* This structure is exported to user space. Do not re-order.
* Only add new stuff to the end, and do not change the size of
* any element. Elements of this structure are used by user
* space, and must match struct user_fp in include/asm-arm/user.h.
* We include the byte offsets below for documentation purposes.
*
* The size of this structure and FPREG are checked by fpmodule.c
* on initialisation. If the rules have been broken, NWFPE will
* not initialise.
*/
typedef struct tagFPA11 {
unsigned int *userRegisters;
FPREG fpreg[8]; /* 8 floating point registers */
FPSR fpsr; /* floating point status register */
FPCR fpcr; /* floating point control register */
unsigned char fType[8]; /* type of floating point value held in
floating point registers. One of none
single, double or extended. */
int initflag; /* this is special. The kernel guarantees
to set it to 0 when a thread is launched,
so we can use it to detect whether this
instance of the emulator needs to be
initialised. */
/* 0 */ FPREG fpreg[8]; /* 8 floating point registers */
/* 96 */ FPSR fpsr; /* floating point status register */
/* 100 */ FPCR fpcr; /* floating point control register */
/* 104 */ unsigned char fType[8]; /* type of floating point value held in
floating point registers. One of none
single, double or extended. */
/* 112 */ int initflag; /* this is special. The kernel guarantees
to set it to 0 when a thread is launched,
so we can use it to detect whether this
instance of the emulator needs to be
initialised. */
} FPA11;
extern void resetFPA11(void);
extern void SetRoundingMode(const unsigned int);
extern void SetRoundingPrecision(const unsigned int);
#define GET_FPA11() ((FPA11 *)(&current->thread.fpstate))
#define GET_USERREG() (GET_FPA11()->userRegisters)
#endif
......@@ -20,9 +20,9 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpa11.h"
#include "fpmodule.h"
#include "fpmodule.inl"
......
......@@ -20,10 +20,10 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "milieu.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpa11.h"
#include "fpa11.inl"
#include "fpmodule.h"
#include "fpmodule.inl"
......@@ -82,8 +82,7 @@ unsigned int PerformFLT(const unsigned int opcode)
unsigned int nRc = 1;
SetRoundingMode(opcode);
SetRoundingPrecision(opcode);
switch (opcode & MASK_ROUNDING_PRECISION)
{
case ROUND_SINGLE:
......
......@@ -21,6 +21,8 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include <linux/module.h>
#include <linux/version.h>
#include <linux/config.h>
......@@ -36,7 +38,6 @@
#include "softfloat.h"
#include "fpopcode.h"
#include "fpmodule.h"
#include "fpa11.h"
#include "fpa11.inl"
/* kernel symbols required for signal handling */
......@@ -86,6 +87,11 @@ static int __init fpe_init(void)
return -EINVAL;
}
if (sizeof(FPREG) != 12) {
printk(KERN_ERR "nwfpe: bad register size\n");
return -EINVAL;
}
#ifdef MODULE
if (!mod_member_present(&__this_module, can_unload))
return -EINVAL;
......
......@@ -19,10 +19,10 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpsr.h"
#include "fpa11.h"
#include "fpmodule.h"
#include "fpmodule.inl"
......
......@@ -19,9 +19,9 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "fpa11.h"
#include "softfloat.h"
#include "fpopcode.h"
#include "fpa11.h"
float32 float32_exp(float32 Fm);
float32 float32_ln(float32 Fm);
......@@ -139,8 +139,7 @@ unsigned int SingleCPDO(const unsigned int opcode)
case RND_CODE:
case URD_CODE:
fpa11->fpreg[Fd].fSingle =
int32_to_float32(float32_to_int32(rFm));
fpa11->fpreg[Fd].fSingle = float32_round_to_int(rFm);
break;
case SQT_CODE:
......
......@@ -364,108 +364,3 @@ static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b )
}
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
The pattern for a default generated quadruple-precision NaN. The `high' and
`low' values hold the most- and least-significant bits, respectively.
-------------------------------------------------------------------------------
*/
#define float128_default_nan_high LIT64( 0xFFFFFFFFFFFFFFFF )
#define float128_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF )
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
otherwise returns 0.
-------------------------------------------------------------------------------
*/
flag float128_is_nan( float128 a )
{
return
( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) )
&& ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is a
signaling NaN; otherwise returns 0.
-------------------------------------------------------------------------------
*/
flag float128_is_signaling_nan( float128 a )
{
return
( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
&& ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
}
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point NaN
`a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
exception is raised.
-------------------------------------------------------------------------------
*/
static commonNaNT float128ToCommonNaN( float128 a )
{
commonNaNT z;
if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
z.sign = a.high>>63;
shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
return z;
}
/*
-------------------------------------------------------------------------------
Returns the result of converting the canonical NaN `a' to the quadruple-
precision floating-point format.
-------------------------------------------------------------------------------
*/
static float128 commonNaNToFloat128( commonNaNT a )
{
float128 z;
shift128Right( a.high, a.low, 16, &z.high, &z.low );
z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
return z;
}
/*
-------------------------------------------------------------------------------
Takes two quadruple-precision floating-point values `a' and `b', one of
which is a NaN, and returns the appropriate NaN result. If either `a' or
`b' is a signaling NaN, the invalid exception is raised.
-------------------------------------------------------------------------------
*/
static float128 propagateFloat128NaN( float128 a, float128 b )
{
flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
aIsNaN = float128_is_nan( a );
aIsSignalingNaN = float128_is_signaling_nan( a );
bIsNaN = float128_is_nan( b );
bIsSignalingNaN = float128_is_signaling_nan( b );
a.high |= LIT64( 0x0000800000000000 );
b.high |= LIT64( 0x0000800000000000 );
if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
if ( aIsNaN ) {
return ( aIsSignalingNaN & bIsNaN ) ? b : a;
}
else {
return b;
}
}
#endif
......@@ -28,6 +28,7 @@ this code that are retained.
===============================================================================
*/
#include "fpa11.h"
#include "milieu.h"
#include "softfloat.h"
......@@ -753,277 +754,6 @@ static floatx80
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the least-significant 64 fraction bits of the quadruple-precision
floating-point value `a'.
-------------------------------------------------------------------------------
*/
INLINE bits64 extractFloat128Frac1( float128 a )
{
return a.low;
}
/*
-------------------------------------------------------------------------------
Returns the most-significant 48 fraction bits of the quadruple-precision
floating-point value `a'.
-------------------------------------------------------------------------------
*/
INLINE bits64 extractFloat128Frac0( float128 a )
{
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
}
/*
-------------------------------------------------------------------------------
Returns the exponent bits of the quadruple-precision floating-point value
`a'.
-------------------------------------------------------------------------------
*/
INLINE int32 extractFloat128Exp( float128 a )
{
return ( a.high>>48 ) & 0x7FFF;
}
/*
-------------------------------------------------------------------------------
Returns the sign bit of the quadruple-precision floating-point value `a'.
-------------------------------------------------------------------------------
*/
INLINE flag extractFloat128Sign( float128 a )
{
return a.high>>63;
}
/*
-------------------------------------------------------------------------------
Normalizes the subnormal quadruple-precision floating-point value
represented by the denormalized significand formed by the concatenation of
`aSig0' and `aSig1'. The normalized exponent is stored at the location
pointed to by `zExpPtr'. The most significant 49 bits of the normalized
significand are stored at the location pointed to by `zSig0Ptr', and the
least significant 64 bits of the normalized significand are stored at the
location pointed to by `zSig1Ptr'.
-------------------------------------------------------------------------------
*/
static void
normalizeFloat128Subnormal(
bits64 aSig0,
bits64 aSig1,
int32 *zExpPtr,
bits64 *zSig0Ptr,
bits64 *zSig1Ptr
)
{
int8 shiftCount;
if ( aSig0 == 0 ) {
shiftCount = countLeadingZeros64( aSig1 ) - 15;
if ( shiftCount < 0 ) {
*zSig0Ptr = aSig1>>( - shiftCount );
*zSig1Ptr = aSig1<<( shiftCount & 63 );
}
else {
*zSig0Ptr = aSig1<<shiftCount;
*zSig1Ptr = 0;
}
*zExpPtr = - shiftCount - 63;
}
else {
shiftCount = countLeadingZeros64( aSig0 ) - 15;
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
*zExpPtr = 1 - shiftCount;
}
}
/*
-------------------------------------------------------------------------------
Packs the sign `zSign', the exponent `zExp', and the significand formed
by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
floating-point value, returning the result. After being shifted into the
proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
added together to form the most significant 32 bits of the result. This
means that any integer portion of `zSig0' will be added into the exponent.
Since a properly normalized significand will have an integer portion equal
to 1, the `zExp' input should be 1 less than the desired result exponent
whenever `zSig0' and `zSig1' concatenated form a complete, normalized
significand.
-------------------------------------------------------------------------------
*/
INLINE float128
packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
{
float128 z;
z.low = zSig1;
z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
return z;
}
/*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and extended significand formed by the concatenation of `zSig0', `zSig1',
and `zSig2', and returns the proper quadruple-precision floating-point value
corresponding to the abstract input. Ordinarily, the abstract value is
simply rounded and packed into the quadruple-precision format, with the
inexact exception raised if the abstract input cannot be represented
exactly. If the abstract value is too large, however, the overflow and
inexact exceptions are raised and an infinity or maximal finite value is
returned. If the abstract value is too small, the input value is rounded to
a subnormal number, and the underflow and inexact exceptions are raised if
the abstract input cannot be represented exactly as a subnormal quadruple-
precision floating-point number.
The input significand must be normalized or smaller. If the input
significand is not normalized, `zExp' must be 0; in that case, the result
returned is a subnormal number, and it must not require rounding. In the
usual case that the input significand is normalized, `zExp' must be 1 less
than the ``true'' floating-point exponent. The handling of underflow and
overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
static float128
roundAndPackFloat128(
flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
{
int8 roundingMode;
flag roundNearestEven, increment, isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
increment = ( (sbits64) zSig2 < 0 );
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
increment = 0;
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig2;
}
else {
increment = ( roundingMode == float_round_up ) && zSig2;
}
}
}
if ( 0x7FFD <= (bits32) zExp ) {
if ( ( 0x7FFD < zExp )
|| ( ( zExp == 0x7FFD )
&& eq128(
LIT64( 0x0001FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF ),
zSig0,
zSig1
)
&& increment
)
) {
float_raise( float_flag_overflow | float_flag_inexact );
if ( ( roundingMode == float_round_to_zero )
|| ( zSign && ( roundingMode == float_round_up ) )
|| ( ! zSign && ( roundingMode == float_round_down ) )
) {
return
packFloat128(
zSign,
0x7FFE,
LIT64( 0x0000FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF )
);
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( zExp < 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < -1 )
|| ! increment
|| lt128(
zSig0,
zSig1,
LIT64( 0x0001FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF )
);
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
zExp = 0;
if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
if ( roundNearestEven ) {
increment = ( (sbits64) zSig2 < 0 );
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig2;
}
else {
increment = ( roundingMode == float_round_up ) && zSig2;
}
}
}
}
if ( zSig2 ) float_exception_flags |= float_flag_inexact;
if ( increment ) {
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
}
else {
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
}
return packFloat128( zSign, zExp, zSig0, zSig1 );
}
/*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and significand formed by the concatenation of `zSig0' and `zSig1', and
returns the proper quadruple-precision floating-point value corresponding to
the abstract input. This routine is just like `roundAndPackFloat128' except
that the input significand has fewer bits and does not have to be normalized
in any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
point exponent.
-------------------------------------------------------------------------------
*/
static float128
normalizeRoundAndPackFloat128(
flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
{
int8 shiftCount;
bits64 zSig2;
if ( zSig0 == 0 ) {
zSig0 = zSig1;
zSig1 = 0;
zExp -= 64;
}
shiftCount = countLeadingZeros64( zSig0 ) - 15;
if ( 0 <= shiftCount ) {
zSig2 = 0;
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
}
else {
shift128ExtraRightJamming(
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
}
zExp -= shiftCount;
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
#endif
/*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
......@@ -1093,33 +823,6 @@ floatx80 int32_to_floatx80( int32 a )
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the result of converting the 32-bit two's complement integer `a' to
the quadruple-precision floating-point format. The conversion is performed
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 int32_to_float128( int32 a )
{
flag zSign;
uint32 absA;
int8 shiftCount;
bits64 zSig0;
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
zSign = ( a < 0 );
absA = zSign ? - a : a;
shiftCount = countLeadingZeros32( absA ) + 17;
zSig0 = absA;
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
}
#endif
/*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
......@@ -1256,40 +959,6 @@ floatx80 float32_to_floatx80( float32 a )
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the result of converting the single-precision floating-point value
`a' to the double-precision floating-point format. The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float32_to_float128( float32 a )
{
flag aSign;
int16 aExp;
bits32 aSig;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
aSign = extractFloat32Sign( a );
if ( aExp == 0xFF ) {
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
return packFloat128( aSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
--aExp;
}
return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
}
#endif
/*
-------------------------------------------------------------------------------
Rounds the single-precision floating-point value `a' to an integer, and
......@@ -2183,41 +1852,6 @@ floatx80 float64_to_floatx80( float64 a )
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the result of converting the double-precision floating-point value
`a' to the quadruple-precision floating-point format. The conversion is
performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float64_to_float128( float64 a )
{
flag aSign;
int16 aExp;
bits64 aSig, zSig0, zSig1;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
aSign = extractFloat64Sign( a );
if ( aExp == 0x7FF ) {
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
return packFloat128( aSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
--aExp;
}
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
}
#endif
/*
-------------------------------------------------------------------------------
Rounds the double-precision floating-point value `a' to an integer, and
......@@ -3029,35 +2663,6 @@ float64 floatx80_to_float64( floatx80 a )
}
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the result of converting the extended double-precision floating-
point value `a' to the quadruple-precision floating-point format. The
conversion is performed according to the IEC/IEEE Standard for Binary
Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 floatx80_to_float128( floatx80 a )
{
flag aSign;
int16 aExp;
bits64 aSig, zSig0, zSig1;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
}
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
return packFloat128( aSign, aExp, zSig0, zSig1 );
}
#endif
/*
-------------------------------------------------------------------------------
Rounds the extended double-precision floating-point value `a' to an integer,
......@@ -3832,1045 +3437,3 @@ flag floatx80_lt_quiet( floatx80 a, floatx80 b )
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point
value `a' to the 32-bit two's complement integer format. The conversion
is performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic---which means in particular that the conversion is rounded
according to the current rounding mode. If `a' is a NaN, the largest
positive integer is returned. Otherwise, if the conversion overflows, the
largest integer with the same sign as `a' is returned.
-------------------------------------------------------------------------------
*/
int32 float128_to_int32( float128 a )
{
flag aSign;
int32 aExp, shiftCount;
bits64 aSig0, aSig1;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
aSig0 |= ( aSig1 != 0 );
shiftCount = 0x4028 - aExp;
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
return roundAndPackInt32( aSign, aSig0 );
}
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point
value `a' to the 32-bit two's complement integer format. The conversion
is performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic, except that the conversion is always rounded toward zero. If
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
conversion overflows, the largest integer with the same sign as `a' is
returned.
-------------------------------------------------------------------------------
*/
int32 float128_to_int32_round_to_zero( float128 a )
{
flag aSign;
int32 aExp, shiftCount;
bits64 aSig0, aSig1, savedASig;
int32 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
aSig0 |= ( aSig1 != 0 );
shiftCount = 0x402F - aExp;
if ( shiftCount < 17 ) {
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
goto invalid;
}
else if ( 48 < shiftCount ) {
if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
return 0;
}
aSig0 |= LIT64( 0x0001000000000000 );
savedASig = aSig0;
aSig0 >>= shiftCount;
z = aSig0;
if ( aSign ) z = - z;
if ( ( z < 0 ) ^ aSign ) {
invalid:
float_exception_flags |= float_flag_invalid;
return aSign ? 0x80000000 : 0x7FFFFFFF;
}
if ( ( aSig0<<shiftCount ) != savedASig ) {
float_exception_flags |= float_flag_inexact;
}
return z;
}
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point
value `a' to the single-precision floating-point format. The conversion
is performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic.
-------------------------------------------------------------------------------
*/
float32 float128_to_float32( float128 a )
{
flag aSign;
int32 aExp;
bits64 aSig0, aSig1;
bits32 zSig;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) {
return commonNaNToFloat32( float128ToCommonNaN( a ) );
}
return packFloat32( aSign, 0xFF, 0 );
}
aSig0 |= ( aSig1 != 0 );
shift64RightJamming( aSig0, 18, &aSig0 );
zSig = aSig0;
if ( aExp || zSig ) {
zSig |= 0x40000000;
aExp -= 0x3F81;
}
return roundAndPackFloat32( aSign, aExp, zSig );
}
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point
value `a' to the double-precision floating-point format. The conversion
is performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic.
-------------------------------------------------------------------------------
*/
float64 float128_to_float64( float128 a )
{
flag aSign;
int32 aExp;
bits64 aSig0, aSig1;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) {
return commonNaNToFloat64( float128ToCommonNaN( a ) );
}
return packFloat64( aSign, 0x7FF, 0 );
}
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
aSig0 |= ( aSig1 != 0 );
if ( aExp || aSig0 ) {
aSig0 |= LIT64( 0x4000000000000000 );
aExp -= 0x3C01;
}
return roundAndPackFloat64( aSign, aExp, aSig0 );
}
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Returns the result of converting the quadruple-precision floating-point
value `a' to the extended double-precision floating-point format. The
conversion is performed according to the IEC/IEEE Standard for Binary
Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 float128_to_floatx80( float128 a )
{
flag aSign;
int32 aExp;
bits64 aSig0, aSig1;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) {
return commonNaNToFloatx80( float128ToCommonNaN( a ) );
}
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
else {
aSig0 |= LIT64( 0x0001000000000000 );
}
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
}
#endif
/*
-------------------------------------------------------------------------------
Rounds the quadruple-precision floating-point value `a' to an integer, and
returns the result as a quadruple-precision floating-point value. The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_round_to_int( float128 a )
{
flag aSign;
int32 aExp;
bits64 lastBitMask, roundBitsMask;
int8 roundingMode;
float128 z;
aExp = extractFloat128Exp( a );
if ( 0x402F <= aExp ) {
if ( 0x406F <= aExp ) {
if ( ( aExp == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
) {
return propagateFloat128NaN( a, a );
}
return a;
}
lastBitMask = 1;
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
roundBitsMask = lastBitMask - 1;
z = a;
roundingMode = float_rounding_mode;
if ( roundingMode == float_round_nearest_even ) {
if ( lastBitMask ) {
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
}
else {
if ( (sbits64) z.low < 0 ) {
++z.high;
if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
}
}
}
else if ( roundingMode != float_round_to_zero ) {
if ( extractFloat128Sign( z )
^ ( roundingMode == float_round_up ) ) {
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
}
}
z.low &= ~ roundBitsMask;
}
else {
if ( aExp <= 0x3FFE ) {
if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
float_exception_flags |= float_flag_inexact;
aSign = extractFloat128Sign( a );
switch ( float_rounding_mode ) {
case float_round_nearest_even:
if ( ( aExp == 0x3FFE )
&& ( extractFloat128Frac0( a )
| extractFloat128Frac1( a ) )
) {
return packFloat128( aSign, 0x3FFF, 0, 0 );
}
break;
case float_round_down:
return
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
: packFloat128( 0, 0, 0, 0 );
case float_round_up:
return
aSign ? packFloat128( 1, 0, 0, 0 )
: packFloat128( 0, 0x3FFF, 0, 0 );
}
return packFloat128( aSign, 0, 0, 0 );
}
lastBitMask = 1;
lastBitMask <<= 0x402F - aExp;
roundBitsMask = lastBitMask - 1;
z.low = 0;
z.high = a.high;
roundingMode = float_rounding_mode;
if ( roundingMode == float_round_nearest_even ) {
z.high += lastBitMask>>1;
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
z.high &= ~ lastBitMask;
}
}
else if ( roundingMode != float_round_to_zero ) {
if ( extractFloat128Sign( z )
^ ( roundingMode == float_round_up ) ) {
z.high |= ( a.low != 0 );
z.high += roundBitsMask;
}
}
z.high &= ~ roundBitsMask;
}
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
float_exception_flags |= float_flag_inexact;
}
return z;
}
/*
-------------------------------------------------------------------------------
Returns the result of adding the absolute values of the quadruple-precision
floating-point values `a' and `b'. If `zSign' is true, the sum is negated
before being returned. `zSign' is ignored if the result is a NaN. The
addition is performed according to the IEC/IEEE Standard for Binary
Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
{
int32 aExp, bExp, zExp;
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
int32 expDiff;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
expDiff = aExp - bExp;
if ( 0 < expDiff ) {
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig0 |= LIT64( 0x0001000000000000 );
}
shift128ExtraRightJamming(
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
zExp = aExp;
}
else if ( expDiff < 0 ) {
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig0 |= LIT64( 0x0001000000000000 );
}
shift128ExtraRightJamming(
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
zExp = bExp;
}
else {
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
return propagateFloat128NaN( a, b );
}
return a;
}
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
zSig2 = 0;
zSig0 |= LIT64( 0x0002000000000000 );
zExp = aExp;
goto shiftRight1;
}
aSig0 |= LIT64( 0x0001000000000000 );
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
--zExp;
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
++zExp;
shiftRight1:
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
roundAndPack:
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
/*
-------------------------------------------------------------------------------
Returns the result of subtracting the absolute values of the quadruple-
precision floating-point values `a' and `b'. If `zSign' is true, the
difference is negated before being returned. `zSign' is ignored if the
result is a NaN. The subtraction is performed according to the IEC/IEEE
Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
{
int32 aExp, bExp, zExp;
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
int32 expDiff;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
expDiff = aExp - bExp;
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
if ( 0 < expDiff ) goto aExpBigger;
if ( expDiff < 0 ) goto bExpBigger;
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
return propagateFloat128NaN( a, b );
}
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
if ( aExp == 0 ) {
aExp = 1;
bExp = 1;
}
if ( bSig0 < aSig0 ) goto aBigger;
if ( aSig0 < bSig0 ) goto bBigger;
if ( bSig1 < aSig1 ) goto aBigger;
if ( aSig1 < bSig1 ) goto bBigger;
return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
bExpBigger:
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig0 |= LIT64( 0x4000000000000000 );
}
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
bSig0 |= LIT64( 0x4000000000000000 );
bBigger:
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
zExp = bExp;
zSign ^= 1;
goto normalizeRoundAndPack;
aExpBigger:
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig0 |= LIT64( 0x4000000000000000 );
}
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
aSig0 |= LIT64( 0x4000000000000000 );
aBigger:
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
zExp = aExp;
normalizeRoundAndPack:
--zExp;
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
}
/*
-------------------------------------------------------------------------------
Returns the result of adding the quadruple-precision floating-point values
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_add( float128 a, float128 b )
{
flag aSign, bSign;
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign == bSign ) {
return addFloat128Sigs( a, b, aSign );
}
else {
return subFloat128Sigs( a, b, aSign );
}
}
/*
-------------------------------------------------------------------------------
Returns the result of subtracting the quadruple-precision floating-point
values `a' and `b'. The operation is performed according to the IEC/IEEE
Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_sub( float128 a, float128 b )
{
flag aSign, bSign;
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign == bSign ) {
return subFloat128Sigs( a, b, aSign );
}
else {
return addFloat128Sigs( a, b, aSign );
}
}
/*
-------------------------------------------------------------------------------
Returns the result of multiplying the quadruple-precision floating-point
values `a' and `b'. The operation is performed according to the IEC/IEEE
Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_mul( float128 a, float128 b )
{
flag aSign, bSign, zSign;
int32 aExp, bExp, zExp;
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
bSign = extractFloat128Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( ( aSig0 | aSig1 )
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
return propagateFloat128NaN( a, b );
}
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
if ( bExp == 0 ) {
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
}
zExp = aExp + bExp - 0x4000;
aSig0 |= LIT64( 0x0001000000000000 );
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
zSig2 |= ( zSig3 != 0 );
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
++zExp;
}
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
/*
-------------------------------------------------------------------------------
Returns the result of dividing the quadruple-precision floating-point value
`a' by the corresponding value `b'. The operation is performed according to
the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_div( float128 a, float128 b )
{
flag aSign, bSign, zSign;
int32 aExp, bExp, zExp;
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
bSign = extractFloat128Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
goto invalid;
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign, 0, 0, 0 );
}
if ( bExp == 0 ) {
if ( ( bSig0 | bSig1 ) == 0 ) {
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
float_raise( float_flag_divbyzero );
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
zExp = aExp - bExp + 0x3FFD;
shortShift128Left(
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
shortShift128Left(
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
++zExp;
}
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
while ( (sbits64) rem0 < 0 ) {
--zSig0;
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
}
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
while ( (sbits64) rem1 < 0 ) {
--zSig1;
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
}
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
}
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
/*
-------------------------------------------------------------------------------
Returns the remainder of the quadruple-precision floating-point value `a'
with respect to the corresponding value `b'. The operation is performed
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_rem( float128 a, float128 b )
{
flag aSign, bSign, zSign;
int32 aExp, bExp, expDiff;
bits64 aSig0, aSig1, bSig0, bSig1;
bits64 q, term0, term1, term2, allZero, alternateASig0, alternateASig1;
bits64 sigMean1;
sbits64 sigMean0;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
bSign = extractFloat128Sign( b );
if ( aExp == 0x7FFF ) {
if ( ( aSig0 | aSig1 )
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
return propagateFloat128NaN( a, b );
}
goto invalid;
}
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return a;
}
if ( bExp == 0 ) {
if ( ( bSig0 | bSig1 ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return a;
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
expDiff = aExp - bExp;
if ( expDiff < -1 ) return a;
shortShift128Left(
aSig0 | LIT64( 0x0001000000000000 ),
aSig1,
15 - ( expDiff < 0 ),
&aSig0,
&aSig1
);
shortShift128Left(
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
q = le128( bSig0, bSig1, aSig0, aSig1 );
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
expDiff -= 64;
while ( 0 < expDiff ) {
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
q = ( 4 < q ) ? q - 4 : 0;
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
expDiff -= 61;
}
if ( -64 < expDiff ) {
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
q = ( 4 < q ) ? q - 4 : 0;
q >>= - expDiff;
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
expDiff += 52;
if ( expDiff < 0 ) {
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
}
else {
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
}
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
}
else {
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
}
do {
alternateASig0 = aSig0;
alternateASig1 = aSig1;
++q;
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
} while ( 0 <= (sbits64) aSig0 );
add128(
aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
if ( ( sigMean0 < 0 )
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
aSig0 = alternateASig0;
aSig1 = alternateASig1;
}
zSign = ( (sbits64) aSig0 < 0 );
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
return
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
}
/*
-------------------------------------------------------------------------------
Returns the square root of the quadruple-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
float128 float128_sqrt( float128 a )
{
flag aSign;
int32 aExp, zExp;
bits64 aSig0, aSig1, zSig0, zSig1, zSig2;
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
bits64 shiftedRem0, shiftedRem1;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
if ( ! aSign ) return a;
goto invalid;
}
if ( aSign ) {
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
aSig0 |= LIT64( 0x0001000000000000 );
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
zSig0 <<= 31;
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
mul64To128( zSig0, zSig0, &term0, &term1 );
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
while ( (sbits64) rem0 < 0 ) {
--zSig0;
shortShift128Left( 0, zSig0, 1, &term0, &term1 );
term1 |= 1;
add128( rem0, rem1, term0, term1, &rem0, &rem1 );
}
shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
if ( ( zSig1 & 0x3FFF ) <= 5 ) {
if ( zSig1 == 0 ) zSig1 = 1;
mul64To128( zSig0, zSig1, &term1, &term2 );
shortShift128Left( term1, term2, 1, &term1, &term2 );
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
mul64To128( zSig1, zSig1, &term2, &term3 );
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
while ( (sbits64) rem1 < 0 ) {
--zSig1;
shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
term3 |= 1;
add192(
rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
}
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
}
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise. The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_eq( float128 a, float128 b )
{
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
if ( float128_is_signaling_nan( a )
|| float128_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
return
( a.low == b.low )
&& ( ( a.high == b.high )
|| ( ( a.low == 0 )
&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
);
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise. The comparison
is performed according to the IEC/IEEE Standard for Binary Floating-point
Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_le( float128 a, float128 b )
{
flag aSign, bSign;
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
float_raise( float_flag_invalid );
return 0;
}
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign != bSign ) {
return
aSign
|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
== 0 );
}
return
aSign ? le128( b.high, b.low, a.high, a.low )
: le128( a.high, a.low, b.high, b.low );
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise. The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_lt( float128 a, float128 b )
{
flag aSign, bSign;
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
float_raise( float_flag_invalid );
return 0;
}
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign != bSign ) {
return
aSign
&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
!= 0 );
}
return
aSign ? lt128( b.high, b.low, a.high, a.low )
: lt128( a.high, a.low, b.high, b.low );
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise. The invalid exception is
raised if either operand is a NaN. Otherwise, the comparison is performed
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_eq_signaling( float128 a, float128 b )
{
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
float_raise( float_flag_invalid );
return 0;
}
return
( a.low == b.low )
&& ( ( a.high == b.high )
|| ( ( a.low == 0 )
&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
);
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is less than
or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
cause an exception. Otherwise, the comparison is performed according to the
IEC/IEEE Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_le_quiet( float128 a, float128 b )
{
flag aSign, bSign;
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
if ( float128_is_signaling_nan( a )
|| float128_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign != bSign ) {
return
aSign
|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
== 0 );
}
return
aSign ? le128( b.high, b.low, a.high, a.low )
: le128( a.high, a.low, b.high, b.low );
}
/*
-------------------------------------------------------------------------------
Returns 1 if the quadruple-precision floating-point value `a' is less than
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
exception. Otherwise, the comparison is performed according to the IEC/IEEE
Standard for Binary Floating-point Arithmetic.
-------------------------------------------------------------------------------
*/
flag float128_lt_quiet( float128 a, float128 b )
{
flag aSign, bSign;
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
) {
if ( float128_is_signaling_nan( a )
|| float128_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign != bSign ) {
return
aSign
&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
!= 0 );
}
return
aSign ? lt128( b.high, b.low, a.high, a.low )
: lt128( a.high, a.low, b.high, b.low );
}
#endif
......@@ -37,12 +37,10 @@ this code that are retained.
The macro `FLOATX80' must be defined to enable the extended double-precision
floating-point format `floatx80'. If this macro is not defined, the
`floatx80' type will not be defined, and none of the functions that either
input or output the `floatx80' type will be defined. The same applies to
the `FLOAT128' macro and the quadruple-precision format `float128'.
input or output the `floatx80' type will be defined.
-------------------------------------------------------------------------------
*/
#define FLOATX80
/* #define FLOAT128 */
/*
-------------------------------------------------------------------------------
......@@ -51,17 +49,10 @@ Software IEC/IEEE floating-point types.
*/
typedef unsigned long int float32;
typedef unsigned long long float64;
#ifdef FLOATX80
typedef struct {
unsigned short high;
unsigned long long low;
} floatx80;
#endif
#ifdef FLOAT128
typedef struct {
unsigned long long high, low;
} float128;
#endif
/*
-------------------------------------------------------------------------------
......@@ -131,9 +122,6 @@ float64 int32_to_float64( signed int );
#ifdef FLOATX80
floatx80 int32_to_floatx80( signed int );
#endif
#ifdef FLOAT128
float128 int32_to_float128( signed int );
#endif
/*
-------------------------------------------------------------------------------
......@@ -146,9 +134,6 @@ float64 float32_to_float64( float32 );
#ifdef FLOATX80
floatx80 float32_to_floatx80( float32 );
#endif
#ifdef FLOAT128
float128 float32_to_float128( float32 );
#endif
/*
-------------------------------------------------------------------------------
......@@ -181,9 +166,6 @@ float32 float64_to_float32( float64 );
#ifdef FLOATX80
floatx80 float64_to_floatx80( float64 );
#endif
#ifdef FLOAT128
float128 float64_to_float128( float64 );
#endif
/*
-------------------------------------------------------------------------------
......@@ -216,9 +198,6 @@ signed int floatx80_to_int32( floatx80 );
signed int floatx80_to_int32_round_to_zero( floatx80 );
float32 floatx80_to_float32( floatx80 );
float64 floatx80_to_float64( floatx80 );
#ifdef FLOAT128
float128 floatx80_to_float128( floatx80 );
#endif
/*
-------------------------------------------------------------------------------
......@@ -250,41 +229,4 @@ char floatx80_is_signaling_nan( floatx80 );
#endif
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Software IEC/IEEE quadruple-precision conversion routines.
-------------------------------------------------------------------------------
*/
signed int float128_to_int32( float128 );
signed int float128_to_int32_round_to_zero( float128 );
float32 float128_to_float32( float128 );
float64 float128_to_float64( float128 );
#ifdef FLOATX80
floatx80 float128_to_floatx80( float128 );
#endif
/*
-------------------------------------------------------------------------------
Software IEC/IEEE quadruple-precision operations.
-------------------------------------------------------------------------------
*/
float128 float128_round_to_int( float128 );
float128 float128_add( float128, float128 );
float128 float128_sub( float128, float128 );
float128 float128_mul( float128, float128 );
float128 float128_div( float128, float128 );
float128 float128_rem( float128, float128 );
float128 float128_sqrt( float128 );
char float128_eq( float128, float128 );
char float128_le( float128, float128 );
char float128_lt( float128, float128 );
char float128_eq_signaling( float128, float128 );
char float128_le_quiet( float128, float128 );
char float128_lt_quiet( float128, float128 );
char float128_is_signaling_nan( float128 );
#endif
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment