lib/libkern/softfloat.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes the following definitions:
roundAndPackInt32
roundAndPackInt64
extractFloat32Frac
extractFloat32Exp
extractFloat32Sign
normalizeFloat32Subnormal
packFloat32
roundAndPackFloat32
normalizeRoundAndPackFloat32
extractFloat64Frac
extractFloat64Exp
extractFloat64Sign
normalizeFloat64Subnormal
packFloat64
roundAndPackFloat64
normalizeRoundAndPackFloat64
extractFloatx80Frac
extractFloatx80Exp
extractFloatx80Sign
normalizeFloatx80Subnormal
packFloatx80
roundAndPackFloatx80
normalizeRoundAndPackFloatx80
extractFloat128Frac1
extractFloat128Frac0
extractFloat128Exp
extractFloat128Sign
normalizeFloat128Subnormal
packFloat128
roundAndPackFloat128
normalizeRoundAndPackFloat128
int32_to_float32
int32_to_float64
int32_to_floatx80
int32_to_float128
int64_to_float32
int64_to_float64
int64_to_floatx80
int64_to_float128
float32_to_int32
float32_to_int32_round_to_zero
float32_to_int64
float32_to_int64_round_to_zero
float32_to_float64
float32_to_floatx80
float32_to_float128
float32_round_to_int
addFloat32Sigs
subFloat32Sigs
float32_add
float32_sub
float32_mul
float32_div
float32_rem
float32_sqrt
float32_eq
float32_le
float32_lt
float32_eq_signaling
float32_le_quiet
float32_lt_quiet
float64_to_int32
float64_to_int32_round_to_zero
float64_to_int64
float64_to_int64_round_to_zero
float64_to_float32
float64_to_floatx80
float64_to_float128
float64_round_to_int
addFloat64Sigs
subFloat64Sigs
float64_add
float64_sub
float64_mul
float64_div
float64_rem
float64_sqrt
float64_eq
float64_le
float64_lt
float64_eq_signaling
float64_le_quiet
float64_lt_quiet
floatx80_to_int32
floatx80_to_int32_round_to_zero
floatx80_to_int64
floatx80_to_int64_round_to_zero
floatx80_to_float32
floatx80_to_float64
floatx80_to_float128
floatx80_round_to_int
addFloatx80Sigs
subFloatx80Sigs
floatx80_add
floatx80_sub
floatx80_mul
floatx80_div
floatx80_rem
floatx80_sqrt
floatx80_eq
floatx80_le
floatx80_lt
floatx80_eq_signaling
floatx80_le_quiet
floatx80_lt_quiet
float128_to_int32
float128_to_int32_round_to_zero
float128_to_int64
float128_to_int64_round_to_zero
float128_to_float32
float128_to_float64
float128_to_floatx80
float128_round_to_int
addFloat128Sigs
subFloat128Sigs
float128_add
float128_sub
float128_mul
float128_div
float128_rem
float128_sqrt
float128_eq
float128_le
float128_lt
float128_eq_signaling
float128_le_quiet
float128_lt_quiet
float64_to_uint32_round_to_zero
float32_to_uint32_round_to_zero
    1 /*      $OpenBSD: softfloat.c,v 1.1 2002/04/28 20:55:14 pvalchev Exp $  */
    2 /*      $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $       */
    3 
    4 /*
    5  * This version hacked for use with gcc -msoft-float by bjh21.
    6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
    7  *  itself).
    8  */
    9 
   10 /*
   11  * Things you may want to define:
   12  *
   13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
   14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
   15  *   properly renamed.
   16  */
   17 
   18 /*
   19 ===============================================================================
   20 
   21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   22 Arithmetic Package, Release 2a.
   23 
   24 Written by John R. Hauser.  This work was made possible in part by the
   25 International Computer Science Institute, located at Suite 600, 1947 Center
   26 Street, Berkeley, California 94704.  Funding was partially provided by the
   27 National Science Foundation under grant MIP-9311980.  The original version
   28 of this code was written as part of a project to build a fixed-point vector
   29 processor in collaboration with the University of California at Berkeley,
   30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
   31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
   32 arithmetic/SoftFloat.html'.
   33 
   34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
   35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
   36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
   37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
   38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
   39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
   40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
   41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
   42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
   43 
   44 Derivative works are acceptable, even for commercial purposes, so long as
   45 (1) they include prominent notice that the work is derivative, and (2) they
   46 include prominent notice akin to these four paragraphs for those parts of
   47 this code that are retained.
   48 
   49 ===============================================================================
   50 */
   51 
   52 #ifndef NO_IEEE
   53 
   54 #include <sys/cdefs.h>
   55 #if defined(LIBC_SCCS) && !defined(lint)
   56 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
   57 #endif /* LIBC_SCCS and not lint */
   58 
   59 #ifdef SOFTFLOAT_FOR_GCC
   60 #include "softfloat-for-gcc.h"
   61 #endif
   62 
   63 #include "milieu.h"
   64 #include "softfloat.h"
   65 
   66 /*
   67  * Conversions between floats as stored in memory and floats as
   68  * SoftFloat uses them
   69  */
/*
 * Fallback definitions: when no earlier definition of these macros is in
 * effect, both expand to their argument unchanged.
 */
#ifndef FLOAT64_DEMANGLE
#define FLOAT64_DEMANGLE(a)     (a)
#endif
#ifndef FLOAT64_MANGLE
#define FLOAT64_MANGLE(a)       (a)
#endif
   76 
   77 /*
   78 -------------------------------------------------------------------------------
   79 Floating-point rounding mode, extended double-precision rounding precision,
   80 and exception flags.
   81 -------------------------------------------------------------------------------
   82 */
   83 
   84 /*
   85  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
   86  *      Right now, it does not.  I've removed all other dynamic global
   87  *      variables. [ross]
   88  */
#ifdef FLOATX80
/*
 * Compiled only when FLOATX80 is defined; initial value is 80.
 * NOTE(review): presumably this is the `roundingPrecision' value handed to
 * roundAndPackFloatx80 by the floatx80 operations -- confirm against the
 * callers.
 */
int8 floatx80_rounding_precision = 80;
#endif
   92 
   93 /*
   94 -------------------------------------------------------------------------------
   95 Primitive arithmetic functions, including multi-word arithmetic, and
   96 division and square root approximations.  (Can be specialized to target if
   97 desired.)
   98 -------------------------------------------------------------------------------
   99 */
  100 #include "softfloat-macros.h"
  101 
  102 /*
  103 -------------------------------------------------------------------------------
  104 Functions and definitions to determine:  (1) whether tininess for underflow
  105 is detected before or after rounding by default, (2) what (if anything)
  106 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  107 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  108 are propagated from function inputs to output.  These details are target-
  109 specific.
  110 -------------------------------------------------------------------------------
  111 */
  112 #include "softfloat-specialize.h"
  113 
  114 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
  115 /*
  116 -------------------------------------------------------------------------------
  117 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  118 and 7, and returns the properly rounded 32-bit integer corresponding to the
  119 input.  If `zSign' is 1, the input is negated before being converted to an
  120 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  121 is simply rounded to an integer, with the inexact exception raised if the
  122 input cannot be represented exactly as an integer.  However, if the fixed-
  123 point input is too large, the invalid exception is raised and the largest
  124 positive or negative integer is returned.
  125 -------------------------------------------------------------------------------
  126 */
  127 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  128 {
  129     int8 roundingMode;
  130     flag roundNearestEven;
  131     int8 roundIncrement, roundBits;
  132     int32 z;
  133 
  134     roundingMode = float_rounding_mode();
  135     roundNearestEven = ( roundingMode == float_round_nearest_even );
  136     roundIncrement = 0x40;
  137     if ( ! roundNearestEven ) {
  138         if ( roundingMode == float_round_to_zero ) {
  139             roundIncrement = 0;
  140         }
  141         else {
  142             roundIncrement = 0x7F;
  143             if ( zSign ) {
  144                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  145             }
  146             else {
  147                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  148             }
  149         }
  150     }
  151     roundBits = absZ & 0x7F;
  152     absZ = ( absZ + roundIncrement )>>7;
  153     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  154     z = absZ;
  155     if ( zSign ) z = - z;
  156     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
  157         float_raise( float_flag_invalid );
  158         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  159     }
  160     if ( roundBits ) float_set_inexact();
  161     return z;
  162 
  163 }
  164 
  165 /*
  166 -------------------------------------------------------------------------------
  167 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  168 `absZ1', with binary point between bits 63 and 64 (between the input words),
  169 and returns the properly rounded 64-bit integer corresponding to the input.
  170 If `zSign' is 1, the input is negated before being converted to an integer.
  171 Ordinarily, the fixed-point input is simply rounded to an integer, with
  172 the inexact exception raised if the input cannot be represented exactly as
  173 an integer.  However, if the fixed-point input is too large, the invalid
  174 exception is raised and the largest positive or negative integer is
  175 returned.
  176 -------------------------------------------------------------------------------
  177 */
  178 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
  179 {
  180     int8 roundingMode;
  181     flag roundNearestEven, increment;
  182     int64 z;
  183 
  184     roundingMode = float_rounding_mode();
  185     roundNearestEven = ( roundingMode == float_round_nearest_even );
  186     increment = ( (sbits64) absZ1 < 0 );
  187     if ( ! roundNearestEven ) {
  188         if ( roundingMode == float_round_to_zero ) {
  189             increment = 0;
  190         }
  191         else {
  192             if ( zSign ) {
  193                 increment = ( roundingMode == float_round_down ) && absZ1;
  194             }
  195             else {
  196                 increment = ( roundingMode == float_round_up ) && absZ1;
  197             }
  198         }
  199     }
  200     if ( increment ) {
  201         ++absZ0;
  202         if ( absZ0 == 0 ) goto overflow;
  203         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
  204     }
  205     z = absZ0;
  206     if ( zSign ) z = - z;
  207     if ( z && ( ( z < 0 ) ^ zSign ) ) {
  208  overflow:
  209         float_raise( float_flag_invalid );
  210         return
  211               zSign ? (sbits64) LIT64( 0x8000000000000000 )
  212             : LIT64( 0x7FFFFFFFFFFFFFFF );
  213     }
  214     if ( absZ1 ) float_set_inexact();
  215     return z;
  216 
  217 }
  218 #endif
  219 
  220 /*
  221 -------------------------------------------------------------------------------
  222 Returns the fraction bits of the single-precision floating-point value `a'.
  223 -------------------------------------------------------------------------------
  224 */
  225 INLINE bits32 extractFloat32Frac( float32 a )
  226 {
  227 
  228     return a & 0x007FFFFF;
  229 
  230 }
  231 
  232 /*
  233 -------------------------------------------------------------------------------
  234 Returns the exponent bits of the single-precision floating-point value `a'.
  235 -------------------------------------------------------------------------------
  236 */
  237 INLINE int16 extractFloat32Exp( float32 a )
  238 {
  239 
  240     return ( a>>23 ) & 0xFF;
  241 
  242 }
  243 
  244 /*
  245 -------------------------------------------------------------------------------
  246 Returns the sign bit of the single-precision floating-point value `a'.
  247 -------------------------------------------------------------------------------
  248 */
  249 INLINE flag extractFloat32Sign( float32 a )
  250 {
  251 
  252     return a>>31;
  253 
  254 }
  255 
  256 /*
  257 -------------------------------------------------------------------------------
  258 Normalizes the subnormal single-precision floating-point value represented
  259 by the denormalized significand `aSig'.  The normalized exponent and
  260 significand are stored at the locations pointed to by `zExpPtr' and
  261 `zSigPtr', respectively.
  262 -------------------------------------------------------------------------------
  263 */
  264 static void
  265  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
  266 {
  267     int8 shiftCount;
  268 
  269     shiftCount = countLeadingZeros32( aSig ) - 8;
  270     *zSigPtr = aSig<<shiftCount;
  271     *zExpPtr = 1 - shiftCount;
  272 
  273 }
  274 
  275 /*
  276 -------------------------------------------------------------------------------
  277 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  278 single-precision floating-point value, returning the result.  After being
  279 shifted into the proper positions, the three fields are simply added
  280 together to form the result.  This means that any integer portion of `zSig'
  281 will be added into the exponent.  Since a properly normalized significand
  282 will have an integer portion equal to 1, the `zExp' input should be 1 less
  283 than the desired result exponent whenever `zSig' is a complete, normalized
  284 significand.
  285 -------------------------------------------------------------------------------
  286 */
  287 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
  288 {
  289 
  290     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
  291 
  292 }
  293 
  294 /*
  295 -------------------------------------------------------------------------------
  296 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  297 and significand `zSig', and returns the proper single-precision floating-
  298 point value corresponding to the abstract input.  Ordinarily, the abstract
  299 value is simply rounded and packed into the single-precision format, with
  300 the inexact exception raised if the abstract input cannot be represented
  301 exactly.  However, if the abstract value is too large, the overflow and
  302 inexact exceptions are raised and an infinity or maximal finite value is
  303 returned.  If the abstract value is too small, the input value is rounded to
  304 a subnormal number, and the underflow and inexact exceptions are raised if
  305 the abstract input cannot be represented exactly as a subnormal single-
  306 precision floating-point number.
  307     The input significand `zSig' has its binary point between bits 30
  308 and 29, which is 7 bits to the left of the usual location.  This shifted
  309 significand must be normalized or smaller.  If `zSig' is not normalized,
  310 `zExp' must be 0; in that case, the result returned is a subnormal number,
  311 and it must not require rounding.  In the usual case that `zSig' is
  312 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  313 The handling of underflow and overflow follows the IEC/IEEE Standard for
  314 Binary Floating-Point Arithmetic.
  315 -------------------------------------------------------------------------------
  316 */
  317 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
  318 {
  319     int8 roundingMode;
  320     flag roundNearestEven;
  321     int8 roundIncrement, roundBits;
  322     flag isTiny;
  323 
  324     roundingMode = float_rounding_mode();
  325     roundNearestEven = ( roundingMode == float_round_nearest_even );
  326     roundIncrement = 0x40;
  327     if ( ! roundNearestEven ) {
  328         if ( roundingMode == float_round_to_zero ) {
  329             roundIncrement = 0;
  330         }
  331         else {
  332             roundIncrement = 0x7F;
  333             if ( zSign ) {
  334                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  335             }
  336             else {
  337                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  338             }
  339         }
  340     }
  341     roundBits = zSig & 0x7F;
  342     if ( 0xFD <= (bits16) zExp ) {
  343         if (    ( 0xFD < zExp )
  344              || (    ( zExp == 0xFD )
  345                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
  346            ) {
  347             float_raise( float_flag_overflow | float_flag_inexact );
  348             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
  349         }
  350         if ( zExp < 0 ) {
  351             isTiny =
  352                    ( float_detect_tininess == float_tininess_before_rounding )
  353                 || ( zExp < -1 )
  354                 || ( zSig + roundIncrement < 0x80000000 );
  355             shift32RightJamming( zSig, - zExp, &zSig );
  356             zExp = 0;
  357             roundBits = zSig & 0x7F;
  358             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  359         }
  360     }
  361     if ( roundBits ) float_set_inexact();
  362     zSig = ( zSig + roundIncrement )>>7;
  363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  364     if ( zSig == 0 ) zExp = 0;
  365     return packFloat32( zSign, zExp, zSig );
  366 
  367 }
  368 
  369 /*
  370 -------------------------------------------------------------------------------
  371 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  372 and significand `zSig', and returns the proper single-precision floating-
  373 point value corresponding to the abstract input.  This routine is just like
  374 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  375 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  376 floating-point exponent.
  377 -------------------------------------------------------------------------------
  378 */
  379 static float32
  380  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
  381 {
  382     int8 shiftCount;
  383 
  384     shiftCount = countLeadingZeros32( zSig ) - 1;
  385     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
  386 
  387 }
  388 
  389 /*
  390 -------------------------------------------------------------------------------
  391 Returns the fraction bits of the double-precision floating-point value `a'.
  392 -------------------------------------------------------------------------------
  393 */
  394 INLINE bits64 extractFloat64Frac( float64 a )
  395 {
  396 
  397     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
  398 
  399 }
  400 
  401 /*
  402 -------------------------------------------------------------------------------
  403 Returns the exponent bits of the double-precision floating-point value `a'.
  404 -------------------------------------------------------------------------------
  405 */
  406 INLINE int16 extractFloat64Exp( float64 a )
  407 {
  408 
  409     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
  410 
  411 }
  412 
  413 /*
  414 -------------------------------------------------------------------------------
  415 Returns the sign bit of the double-precision floating-point value `a'.
  416 -------------------------------------------------------------------------------
  417 */
  418 INLINE flag extractFloat64Sign( float64 a )
  419 {
  420 
  421     return FLOAT64_DEMANGLE(a)>>63;
  422 
  423 }
  424 
  425 /*
  426 -------------------------------------------------------------------------------
  427 Normalizes the subnormal double-precision floating-point value represented
  428 by the denormalized significand `aSig'.  The normalized exponent and
  429 significand are stored at the locations pointed to by `zExpPtr' and
  430 `zSigPtr', respectively.
  431 -------------------------------------------------------------------------------
  432 */
  433 static void
  434  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
  435 {
  436     int8 shiftCount;
  437 
  438     shiftCount = countLeadingZeros64( aSig ) - 11;
  439     *zSigPtr = aSig<<shiftCount;
  440     *zExpPtr = 1 - shiftCount;
  441 
  442 }
  443 
  444 /*
  445 -------------------------------------------------------------------------------
  446 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  447 double-precision floating-point value, returning the result.  After being
  448 shifted into the proper positions, the three fields are simply added
  449 together to form the result.  This means that any integer portion of `zSig'
  450 will be added into the exponent.  Since a properly normalized significand
  451 will have an integer portion equal to 1, the `zExp' input should be 1 less
  452 than the desired result exponent whenever `zSig' is a complete, normalized
  453 significand.
  454 -------------------------------------------------------------------------------
  455 */
  456 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
  457 {
  458 
  459     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
  460                            ( ( (bits64) zExp )<<52 ) + zSig );
  461 
  462 }
  463 
  464 /*
  465 -------------------------------------------------------------------------------
  466 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  467 and significand `zSig', and returns the proper double-precision floating-
  468 point value corresponding to the abstract input.  Ordinarily, the abstract
  469 value is simply rounded and packed into the double-precision format, with
  470 the inexact exception raised if the abstract input cannot be represented
  471 exactly.  However, if the abstract value is too large, the overflow and
  472 inexact exceptions are raised and an infinity or maximal finite value is
  473 returned.  If the abstract value is too small, the input value is rounded to
  474 a subnormal number, and the underflow and inexact exceptions are raised if
  475 the abstract input cannot be represented exactly as a subnormal double-
  476 precision floating-point number.
  477     The input significand `zSig' has its binary point between bits 62
  478 and 61, which is 10 bits to the left of the usual location.  This shifted
  479 significand must be normalized or smaller.  If `zSig' is not normalized,
  480 `zExp' must be 0; in that case, the result returned is a subnormal number,
  481 and it must not require rounding.  In the usual case that `zSig' is
  482 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  483 The handling of underflow and overflow follows the IEC/IEEE Standard for
  484 Binary Floating-Point Arithmetic.
  485 -------------------------------------------------------------------------------
  486 */
  487 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
  488 {
  489     int8 roundingMode;
  490     flag roundNearestEven;
  491     int16 roundIncrement, roundBits;
  492     flag isTiny;
  493 
  494     roundingMode = float_rounding_mode();
  495     roundNearestEven = ( roundingMode == float_round_nearest_even );
  496     roundIncrement = 0x200;
  497     if ( ! roundNearestEven ) {
  498         if ( roundingMode == float_round_to_zero ) {
  499             roundIncrement = 0;
  500         }
  501         else {
  502             roundIncrement = 0x3FF;
  503             if ( zSign ) {
  504                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  505             }
  506             else {
  507                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  508             }
  509         }
  510     }
  511     roundBits = zSig & 0x3FF;
  512     if ( 0x7FD <= (bits16) zExp ) {
  513         if (    ( 0x7FD < zExp )
  514              || (    ( zExp == 0x7FD )
  515                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
  516            ) {
  517             float_raise( float_flag_overflow | float_flag_inexact );
  518             return FLOAT64_MANGLE(
  519                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
  520                 ( roundIncrement == 0 ));
  521         }
  522         if ( zExp < 0 ) {
  523             isTiny =
  524                    ( float_detect_tininess == float_tininess_before_rounding )
  525                 || ( zExp < -1 )
  526                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
  527             shift64RightJamming( zSig, - zExp, &zSig );
  528             zExp = 0;
  529             roundBits = zSig & 0x3FF;
  530             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  531         }
  532     }
  533     if ( roundBits ) float_set_inexact();
  534     zSig = ( zSig + roundIncrement )>>10;
  535     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
  536     if ( zSig == 0 ) zExp = 0;
  537     return packFloat64( zSign, zExp, zSig );
  538 
  539 }
  540 
  541 /*
  542 -------------------------------------------------------------------------------
  543 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  544 and significand `zSig', and returns the proper double-precision floating-
  545 point value corresponding to the abstract input.  This routine is just like
  546 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  547 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  548 floating-point exponent.
  549 -------------------------------------------------------------------------------
  550 */
  551 static float64
  552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
  553 {
  554     int8 shiftCount;
  555 
  556     shiftCount = countLeadingZeros64( zSig ) - 1;
  557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
  558 
  559 }
  560 
  561 #ifdef FLOATX80
  562 
  563 /*
  564 -------------------------------------------------------------------------------
  565 Returns the fraction bits of the extended double-precision floating-point
  566 value `a'.
  567 -------------------------------------------------------------------------------
  568 */
  569 INLINE bits64 extractFloatx80Frac( floatx80 a )
  570 {
  571 
  572     return a.low;
  573 
  574 }
  575 
  576 /*
  577 -------------------------------------------------------------------------------
  578 Returns the exponent bits of the extended double-precision floating-point
  579 value `a'.
  580 -------------------------------------------------------------------------------
  581 */
  582 INLINE int32 extractFloatx80Exp( floatx80 a )
  583 {
  584 
  585     return a.high & 0x7FFF;
  586 
  587 }
  588 
  589 /*
  590 -------------------------------------------------------------------------------
  591 Returns the sign bit of the extended double-precision floating-point value
  592 `a'.
  593 -------------------------------------------------------------------------------
  594 */
  595 INLINE flag extractFloatx80Sign( floatx80 a )
  596 {
  597 
  598     return a.high>>15;
  599 
  600 }
  601 
  602 /*
  603 -------------------------------------------------------------------------------
  604 Normalizes the subnormal extended double-precision floating-point value
  605 represented by the denormalized significand `aSig'.  The normalized exponent
  606 and significand are stored at the locations pointed to by `zExpPtr' and
  607 `zSigPtr', respectively.
  608 -------------------------------------------------------------------------------
  609 */
  610 static void
  611  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
  612 {
  613     int8 shiftCount;
  614 
  615     shiftCount = countLeadingZeros64( aSig );
  616     *zSigPtr = aSig<<shiftCount;
  617     *zExpPtr = 1 - shiftCount;
  618 
  619 }
  620 
  621 /*
  622 -------------------------------------------------------------------------------
  623 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  624 extended double-precision floating-point value, returning the result.
  625 -------------------------------------------------------------------------------
  626 */
  627 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
  628 {
  629     floatx80 z;
  630 
  631     z.low = zSig;
  632     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
  633     return z;
  634 
  635 }
  636 
  637 /*
  638 -------------------------------------------------------------------------------
  639 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  640 and extended significand formed by the concatenation of `zSig0' and `zSig1',
  641 and returns the proper extended double-precision floating-point value
  642 corresponding to the abstract input.  Ordinarily, the abstract value is
  643 rounded and packed into the extended double-precision format, with the
  644 inexact exception raised if the abstract input cannot be represented
  645 exactly.  However, if the abstract value is too large, the overflow and
  646 inexact exceptions are raised and an infinity or maximal finite value is
  647 returned.  If the abstract value is too small, the input value is rounded to
  648 a subnormal number, and the underflow and inexact exceptions are raised if
  649 the abstract input cannot be represented exactly as a subnormal extended
  650 double-precision floating-point number.
  651     If `roundingPrecision' is 32 or 64, the result is rounded to the same
  652 number of bits as single or double precision, respectively.  Otherwise, the
  653 result is rounded to the full precision of the extended double-precision
  654 format.
  655     The input significand must be normalized or smaller.  If the input
  656 significand is not normalized, `zExp' must be 0; in that case, the result
  657 returned is a subnormal number, and it must not require rounding.  The
  658 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  659 Floating-Point Arithmetic.
  660 -------------------------------------------------------------------------------
  661 */
  662 static floatx80
  663  roundAndPackFloatx80(
  664      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  665  )
  666 {
  667     int8 roundingMode;
  668     flag roundNearestEven, increment, isTiny;
  669     int64 roundIncrement, roundMask, roundBits;
  670 
  671     roundingMode = float_rounding_mode();
  672     roundNearestEven = ( roundingMode == float_round_nearest_even );
  673     if ( roundingPrecision == 80 ) goto precision80;
  674     if ( roundingPrecision == 64 ) {
  675         roundIncrement = LIT64( 0x0000000000000400 );
  676         roundMask = LIT64( 0x00000000000007FF );
  677     }
  678     else if ( roundingPrecision == 32 ) {
  679         roundIncrement = LIT64( 0x0000008000000000 );
  680         roundMask = LIT64( 0x000000FFFFFFFFFF );
  681     }
  682     else {
  683         goto precision80;
  684     }
  685     zSig0 |= ( zSig1 != 0 );
  686     if ( ! roundNearestEven ) {
  687         if ( roundingMode == float_round_to_zero ) {
  688             roundIncrement = 0;
  689         }
  690         else {
  691             roundIncrement = roundMask;
  692             if ( zSign ) {
  693                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  694             }
  695             else {
  696                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  697             }
  698         }
  699     }
  700     roundBits = zSig0 & roundMask;
  701     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  702         if (    ( 0x7FFE < zExp )
  703              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
  704            ) {
  705             goto overflow;
  706         }
  707         if ( zExp <= 0 ) {
  708             isTiny =
  709                    ( float_detect_tininess == float_tininess_before_rounding )
  710                 || ( zExp < 0 )
  711                 || ( zSig0 <= zSig0 + roundIncrement );
  712             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  713             zExp = 0;
  714             roundBits = zSig0 & roundMask;
  715             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  716             if ( roundBits ) float_set_inexact();
  717             zSig0 += roundIncrement;
  718             if ( (sbits64) zSig0 < 0 ) zExp = 1;
  719             roundIncrement = roundMask + 1;
  720             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  721                 roundMask |= roundIncrement;
  722             }
  723             zSig0 &= ~ roundMask;
  724             return packFloatx80( zSign, zExp, zSig0 );
  725         }
  726     }
  727     if ( roundBits ) float_set_inexact();
  728     zSig0 += roundIncrement;
  729     if ( zSig0 < roundIncrement ) {
  730         ++zExp;
  731         zSig0 = LIT64( 0x8000000000000000 );
  732     }
  733     roundIncrement = roundMask + 1;
  734     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  735         roundMask |= roundIncrement;
  736     }
  737     zSig0 &= ~ roundMask;
  738     if ( zSig0 == 0 ) zExp = 0;
  739     return packFloatx80( zSign, zExp, zSig0 );
  740  precision80:
  741     increment = ( (sbits64) zSig1 < 0 );
  742     if ( ! roundNearestEven ) {
  743         if ( roundingMode == float_round_to_zero ) {
  744             increment = 0;
  745         }
  746         else {
  747             if ( zSign ) {
  748                 increment = ( roundingMode == float_round_down ) && zSig1;
  749             }
  750             else {
  751                 increment = ( roundingMode == float_round_up ) && zSig1;
  752             }
  753         }
  754     }
  755     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  756         if (    ( 0x7FFE < zExp )
  757              || (    ( zExp == 0x7FFE )
  758                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
  759                   && increment
  760                 )
  761            ) {
  762             roundMask = 0;
  763  overflow:
  764             float_raise( float_flag_overflow | float_flag_inexact );
  765             if (    ( roundingMode == float_round_to_zero )
  766                  || ( zSign && ( roundingMode == float_round_up ) )
  767                  || ( ! zSign && ( roundingMode == float_round_down ) )
  768                ) {
  769                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
  770             }
  771             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  772         }
  773         if ( zExp <= 0 ) {
  774             isTiny =
  775                    ( float_detect_tininess == float_tininess_before_rounding )
  776                 || ( zExp < 0 )
  777                 || ! increment
  778                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
  779             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  780             zExp = 0;
  781             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
  782             if ( zSig1 ) float_set_inexact();
  783             if ( roundNearestEven ) {
  784                 increment = ( (sbits64) zSig1 < 0 );
  785             }
  786             else {
  787                 if ( zSign ) {
  788                     increment = ( roundingMode == float_round_down ) && zSig1;
  789                 }
  790                 else {
  791                     increment = ( roundingMode == float_round_up ) && zSig1;
  792                 }
  793             }
  794             if ( increment ) {
  795                 ++zSig0;
  796                 zSig0 &=
  797                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  798                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
  799             }
  800             return packFloatx80( zSign, zExp, zSig0 );
  801         }
  802     }
  803     if ( zSig1 ) float_set_inexact();
  804     if ( increment ) {
  805         ++zSig0;
  806         if ( zSig0 == 0 ) {
  807             ++zExp;
  808             zSig0 = LIT64( 0x8000000000000000 );
  809         }
  810         else {
  811             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  812         }
  813     }
  814     else {
  815         if ( zSig0 == 0 ) zExp = 0;
  816     }
  817     return packFloatx80( zSign, zExp, zSig0 );
  818 
  819 }
  820 
  821 /*
  822 -------------------------------------------------------------------------------
  823 Takes an abstract floating-point value having sign `zSign', exponent
  824 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  825 and returns the proper extended double-precision floating-point value
  826 corresponding to the abstract input.  This routine is just like
  827 `roundAndPackFloatx80' except that the input significand does not have to be
  828 normalized.
  829 -------------------------------------------------------------------------------
  830 */
  831 static floatx80
  832  normalizeRoundAndPackFloatx80(
  833      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  834  )
  835 {
  836     int8 shiftCount;
  837 
  838     if ( zSig0 == 0 ) {
  839         zSig0 = zSig1;
  840         zSig1 = 0;
  841         zExp -= 64;
  842     }
  843     shiftCount = countLeadingZeros64( zSig0 );
  844     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  845     zExp -= shiftCount;
  846     return
  847         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  848 
  849 }
  850 
  851 #endif
  852 
  853 #ifdef FLOAT128
  854 
  855 /*
  856 -------------------------------------------------------------------------------
  857 Returns the least-significant 64 fraction bits of the quadruple-precision
  858 floating-point value `a'.
  859 -------------------------------------------------------------------------------
  860 */
  861 INLINE bits64 extractFloat128Frac1( float128 a )
  862 {
  863 
  864     return a.low;
  865 
  866 }
  867 
  868 /*
  869 -------------------------------------------------------------------------------
  870 Returns the most-significant 48 fraction bits of the quadruple-precision
  871 floating-point value `a'.
  872 -------------------------------------------------------------------------------
  873 */
  874 INLINE bits64 extractFloat128Frac0( float128 a )
  875 {
  876 
  877     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
  878 
  879 }
  880 
  881 /*
  882 -------------------------------------------------------------------------------
  883 Returns the exponent bits of the quadruple-precision floating-point value
  884 `a'.
  885 -------------------------------------------------------------------------------
  886 */
  887 INLINE int32 extractFloat128Exp( float128 a )
  888 {
  889 
  890     return ( a.high>>48 ) & 0x7FFF;
  891 
  892 }
  893 
  894 /*
  895 -------------------------------------------------------------------------------
  896 Returns the sign bit of the quadruple-precision floating-point value `a'.
  897 -------------------------------------------------------------------------------
  898 */
  899 INLINE flag extractFloat128Sign( float128 a )
  900 {
  901 
  902     return a.high>>63;
  903 
  904 }
  905 
  906 /*
  907 -------------------------------------------------------------------------------
  908 Normalizes the subnormal quadruple-precision floating-point value
  909 represented by the denormalized significand formed by the concatenation of
  910 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
  911 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
  912 significand are stored at the location pointed to by `zSig0Ptr', and the
  913 least significant 64 bits of the normalized significand are stored at the
  914 location pointed to by `zSig1Ptr'.
  915 -------------------------------------------------------------------------------
  916 */
  917 static void
  918  normalizeFloat128Subnormal(
  919      bits64 aSig0,
  920      bits64 aSig1,
  921      int32 *zExpPtr,
  922      bits64 *zSig0Ptr,
  923      bits64 *zSig1Ptr
  924  )
  925 {
  926     int8 shiftCount;
  927 
  928     if ( aSig0 == 0 ) {
  929         shiftCount = countLeadingZeros64( aSig1 ) - 15;
  930         if ( shiftCount < 0 ) {
  931             *zSig0Ptr = aSig1>>( - shiftCount );
  932             *zSig1Ptr = aSig1<<( shiftCount & 63 );
  933         }
  934         else {
  935             *zSig0Ptr = aSig1<<shiftCount;
  936             *zSig1Ptr = 0;
  937         }
  938         *zExpPtr = - shiftCount - 63;
  939     }
  940     else {
  941         shiftCount = countLeadingZeros64( aSig0 ) - 15;
  942         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  943         *zExpPtr = 1 - shiftCount;
  944     }
  945 
  946 }
  947 
  948 /*
  949 -------------------------------------------------------------------------------
  950 Packs the sign `zSign', the exponent `zExp', and the significand formed
  951 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  952 floating-point value, returning the result.  After being shifted into the
  953 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  954 added together to form the most significant 32 bits of the result.  This
  955 means that any integer portion of `zSig0' will be added into the exponent.
  956 Since a properly normalized significand will have an integer portion equal
  957 to 1, the `zExp' input should be 1 less than the desired result exponent
  958 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  959 significand.
  960 -------------------------------------------------------------------------------
  961 */
  962 INLINE float128
  963  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
  964 {
  965     float128 z;
  966 
  967     z.low = zSig1;
  968     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
  969     return z;
  970 
  971 }
  972 
  973 /*
  974 -------------------------------------------------------------------------------
  975 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  976 and extended significand formed by the concatenation of `zSig0', `zSig1',
  977 and `zSig2', and returns the proper quadruple-precision floating-point value
  978 corresponding to the abstract input.  Ordinarily, the abstract value is
  979 simply rounded and packed into the quadruple-precision format, with the
  980 inexact exception raised if the abstract input cannot be represented
  981 exactly.  However, if the abstract value is too large, the overflow and
  982 inexact exceptions are raised and an infinity or maximal finite value is
  983 returned.  If the abstract value is too small, the input value is rounded to
  984 a subnormal number, and the underflow and inexact exceptions are raised if
  985 the abstract input cannot be represented exactly as a subnormal quadruple-
  986 precision floating-point number.
  987     The input significand must be normalized or smaller.  If the input
  988 significand is not normalized, `zExp' must be 0; in that case, the result
  989 returned is a subnormal number, and it must not require rounding.  In the
  990 usual case that the input significand is normalized, `zExp' must be 1 less
  991 than the ``true'' floating-point exponent.  The handling of underflow and
  992 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  993 -------------------------------------------------------------------------------
  994 */
  995 static float128
  996  roundAndPackFloat128(
  997      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
  998 {
  999     int8 roundingMode;
 1000     flag roundNearestEven, increment, isTiny;
 1001 
 1002     roundingMode = float_rounding_mode();
 1003     roundNearestEven = ( roundingMode == float_round_nearest_even );
 1004     increment = ( (sbits64) zSig2 < 0 );
 1005     if ( ! roundNearestEven ) {
 1006         if ( roundingMode == float_round_to_zero ) {
 1007             increment = 0;
 1008         }
 1009         else {
 1010             if ( zSign ) {
 1011                 increment = ( roundingMode == float_round_down ) && zSig2;
 1012             }
 1013             else {
 1014                 increment = ( roundingMode == float_round_up ) && zSig2;
 1015             }
 1016         }
 1017     }
 1018     if ( 0x7FFD <= (bits32) zExp ) {
 1019         if (    ( 0x7FFD < zExp )
 1020              || (    ( zExp == 0x7FFD )
 1021                   && eq128(
 1022                          LIT64( 0x0001FFFFFFFFFFFF ),
 1023                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 1024                          zSig0,
 1025                          zSig1
 1026                      )
 1027                   && increment
 1028                 )
 1029            ) {
 1030             float_raise( float_flag_overflow | float_flag_inexact );
 1031             if (    ( roundingMode == float_round_to_zero )
 1032                  || ( zSign && ( roundingMode == float_round_up ) )
 1033                  || ( ! zSign && ( roundingMode == float_round_down ) )
 1034                ) {
 1035                 return
 1036                     packFloat128(
 1037                         zSign,
 1038                         0x7FFE,
 1039                         LIT64( 0x0000FFFFFFFFFFFF ),
 1040                         LIT64( 0xFFFFFFFFFFFFFFFF )
 1041                     );
 1042             }
 1043             return packFloat128( zSign, 0x7FFF, 0, 0 );
 1044         }
 1045         if ( zExp < 0 ) {
 1046             isTiny =
 1047                    ( float_detect_tininess == float_tininess_before_rounding )
 1048                 || ( zExp < -1 )
 1049                 || ! increment
 1050                 || lt128(
 1051                        zSig0,
 1052                        zSig1,
 1053                        LIT64( 0x0001FFFFFFFFFFFF ),
 1054                        LIT64( 0xFFFFFFFFFFFFFFFF )
 1055                    );
 1056             shift128ExtraRightJamming(
 1057                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 1058             zExp = 0;
 1059             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 1060             if ( roundNearestEven ) {
 1061                 increment = ( (sbits64) zSig2 < 0 );
 1062             }
 1063             else {
 1064                 if ( zSign ) {
 1065                     increment = ( roundingMode == float_round_down ) && zSig2;
 1066                 }
 1067                 else {
 1068                     increment = ( roundingMode == float_round_up ) && zSig2;
 1069                 }
 1070             }
 1071         }
 1072     }
 1073     if ( zSig2 ) float_set_inexact();
 1074     if ( increment ) {
 1075         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 1076         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 1077     }
 1078     else {
 1079         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 1080     }
 1081     return packFloat128( zSign, zExp, zSig0, zSig1 );
 1082 
 1083 }
 1084 
 1085 /*
 1086 -------------------------------------------------------------------------------
 1087 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 1088 and significand formed by the concatenation of `zSig0' and `zSig1', and
 1089 returns the proper quadruple-precision floating-point value corresponding
 1090 to the abstract input.  This routine is just like `roundAndPackFloat128'
 1091 except that the input significand has fewer bits and does not have to be
 1092 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 1093 point exponent.
 1094 -------------------------------------------------------------------------------
 1095 */
 1096 static float128
 1097  normalizeRoundAndPackFloat128(
 1098      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 1099 {
 1100     int8 shiftCount;
 1101     bits64 zSig2;
 1102 
 1103     if ( zSig0 == 0 ) {
 1104         zSig0 = zSig1;
 1105         zSig1 = 0;
 1106         zExp -= 64;
 1107     }
 1108     shiftCount = countLeadingZeros64( zSig0 ) - 15;
 1109     if ( 0 <= shiftCount ) {
 1110         zSig2 = 0;
 1111         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 1112     }
 1113     else {
 1114         shift128ExtraRightJamming(
 1115             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
 1116     }
 1117     zExp -= shiftCount;
 1118     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 1119 
 1120 }
 1121 
 1122 #endif
 1123 
 1124 /*
 1125 -------------------------------------------------------------------------------
 1126 Returns the result of converting the 32-bit two's complement integer `a'
 1127 to the single-precision floating-point format.  The conversion is performed
 1128 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1129 -------------------------------------------------------------------------------
 1130 */
 1131 float32 int32_to_float32( int32 a )
 1132 {
 1133     flag zSign;
 1134 
 1135     if ( a == 0 ) return 0;
 1136     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 1137     zSign = ( a < 0 );
 1138     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
 1139 
 1140 }
 1141 
 1142 /*
 1143 -------------------------------------------------------------------------------
 1144 Returns the result of converting the 32-bit two's complement integer `a'
 1145 to the double-precision floating-point format.  The conversion is performed
 1146 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1147 -------------------------------------------------------------------------------
 1148 */
 1149 float64 int32_to_float64( int32 a )
 1150 {
 1151     flag zSign;
 1152     uint32 absA;
 1153     int8 shiftCount;
 1154     bits64 zSig;
 1155 
 1156     if ( a == 0 ) return 0;
 1157     zSign = ( a < 0 );
 1158     absA = zSign ? - a : a;
 1159     shiftCount = countLeadingZeros32( absA ) + 21;
 1160     zSig = absA;
 1161     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
 1162 
 1163 }
 1164 
 1165 #ifdef FLOATX80
 1166 
 1167 /*
 1168 -------------------------------------------------------------------------------
 1169 Returns the result of converting the 32-bit two's complement integer `a'
 1170 to the extended double-precision floating-point format.  The conversion
 1171 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1172 Arithmetic.
 1173 -------------------------------------------------------------------------------
 1174 */
 1175 floatx80 int32_to_floatx80( int32 a )
 1176 {
 1177     flag zSign;
 1178     uint32 absA;
 1179     int8 shiftCount;
 1180     bits64 zSig;
 1181 
 1182     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 1183     zSign = ( a < 0 );
 1184     absA = zSign ? - a : a;
 1185     shiftCount = countLeadingZeros32( absA ) + 32;
 1186     zSig = absA;
 1187     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 1188 
 1189 }
 1190 
 1191 #endif
 1192 
 1193 #ifdef FLOAT128
 1194 
 1195 /*
 1196 -------------------------------------------------------------------------------
 1197 Returns the result of converting the 32-bit two's complement integer `a' to
 1198 the quadruple-precision floating-point format.  The conversion is performed
 1199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1200 -------------------------------------------------------------------------------
 1201 */
 1202 float128 int32_to_float128( int32 a )
 1203 {
 1204     flag zSign;
 1205     uint32 absA;
 1206     int8 shiftCount;
 1207     bits64 zSig0;
 1208 
 1209     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 1210     zSign = ( a < 0 );
 1211     absA = zSign ? - a : a;
 1212     shiftCount = countLeadingZeros32( absA ) + 17;
 1213     zSig0 = absA;
 1214     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
 1215 
 1216 }
 1217 
 1218 #endif
 1219 
 1220 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
 1221 /*
 1222 -------------------------------------------------------------------------------
 1223 Returns the result of converting the 64-bit two's complement integer `a'
 1224 to the single-precision floating-point format.  The conversion is performed
 1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1226 -------------------------------------------------------------------------------
 1227 */
 1228 float32 int64_to_float32( int64 a )
 1229 {
 1230     flag zSign;
 1231     uint64 absA;
 1232     int8 shiftCount;
 1233 
 1234     if ( a == 0 ) return 0;
 1235     zSign = ( a < 0 );
 1236     absA = zSign ? - a : a;
 1237     shiftCount = countLeadingZeros64( absA ) - 40;
 1238     if ( 0 <= shiftCount ) {
 1239         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
 1240     }
 1241     else {
 1242         shiftCount += 7;
 1243         if ( shiftCount < 0 ) {
 1244             shift64RightJamming( absA, - shiftCount, &absA );
 1245         }
 1246         else {
 1247             absA <<= shiftCount;
 1248         }
 1249         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
 1250     }
 1251 
 1252 }
 1253 
 1254 /*
 1255 -------------------------------------------------------------------------------
 1256 Returns the result of converting the 64-bit two's complement integer `a'
 1257 to the double-precision floating-point format.  The conversion is performed
 1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1259 -------------------------------------------------------------------------------
 1260 */
 1261 float64 int64_to_float64( int64 a )
 1262 {
 1263     flag zSign;
 1264 
 1265     if ( a == 0 ) return 0;
 1266     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
 1267         return packFloat64( 1, 0x43E, 0 );
 1268     }
 1269     zSign = ( a < 0 );
 1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
 1271 
 1272 }
 1273 
 1274 #ifdef FLOATX80
 1275 
 1276 /*
 1277 -------------------------------------------------------------------------------
 1278 Returns the result of converting the 64-bit two's complement integer `a'
 1279 to the extended double-precision floating-point format.  The conversion
 1280 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1281 Arithmetic.
 1282 -------------------------------------------------------------------------------
 1283 */
 1284 floatx80 int64_to_floatx80( int64 a )
 1285 {
 1286     flag zSign;
 1287     uint64 absA;
 1288     int8 shiftCount;
 1289 
 1290     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 1291     zSign = ( a < 0 );
 1292     absA = zSign ? - a : a;
 1293     shiftCount = countLeadingZeros64( absA );
 1294     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
 1295 
 1296 }
 1297 
 1298 #endif
 1299 
 1300 #ifdef FLOAT128
 1301 
 1302 /*
 1303 -------------------------------------------------------------------------------
 1304 Returns the result of converting the 64-bit two's complement integer `a' to
 1305 the quadruple-precision floating-point format.  The conversion is performed
 1306 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1307 -------------------------------------------------------------------------------
 1308 */
 1309 float128 int64_to_float128( int64 a )
 1310 {
 1311     flag zSign;
 1312     uint64 absA;
 1313     int8 shiftCount;
 1314     int32 zExp;
 1315     bits64 zSig0, zSig1;
 1316 
 1317     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 1318     zSign = ( a < 0 );
 1319     absA = zSign ? - a : a;
 1320     shiftCount = countLeadingZeros64( absA ) + 49;
 1321     zExp = 0x406E - shiftCount;
 1322     if ( 64 <= shiftCount ) {
 1323         zSig1 = 0;
 1324         zSig0 = absA;
 1325         shiftCount -= 64;
 1326     }
 1327     else {
 1328         zSig1 = absA;
 1329         zSig0 = 0;
 1330     }
 1331     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 1332     return packFloat128( zSign, zExp, zSig0, zSig1 );
 1333 
 1334 }
 1335 
 1336 #endif
 1337 #endif /* !SOFTFLOAT_FOR_GCC */
 1338 
 1339 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1340 /*
 1341 -------------------------------------------------------------------------------
 1342 Returns the result of converting the single-precision floating-point value
 1343 `a' to the 32-bit two's complement integer format.  The conversion is
 1344 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1345 Arithmetic---which means in particular that the conversion is rounded
 1346 according to the current rounding mode.  If `a' is a NaN, the largest
 1347 positive integer is returned.  Otherwise, if the conversion overflows, the
 1348 largest integer with the same sign as `a' is returned.
 1349 -------------------------------------------------------------------------------
 1350 */
 1351 int32 float32_to_int32( float32 a )
 1352 {
 1353     flag aSign;
 1354     int16 aExp, shiftCount;
 1355     bits32 aSig;
 1356     bits64 aSig64;
 1357 
 1358     aSig = extractFloat32Frac( a );
 1359     aExp = extractFloat32Exp( a );
 1360     aSign = extractFloat32Sign( a );
 1361     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
 1362     if ( aExp ) aSig |= 0x00800000;
 1363     shiftCount = 0xAF - aExp;
 1364     aSig64 = aSig;
 1365     aSig64 <<= 32;
 1366     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
 1367     return roundAndPackInt32( aSign, aSig64 );
 1368 
 1369 }
 1370 #endif /* !SOFTFLOAT_FOR_GCC */
 1371 
 1372 /*
 1373 -------------------------------------------------------------------------------
 1374 Returns the result of converting the single-precision floating-point value
 1375 `a' to the 32-bit two's complement integer format.  The conversion is
 1376 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1377 Arithmetic, except that the conversion is always rounded toward zero.
 1378 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 1379 the conversion overflows, the largest integer with the same sign as `a' is
 1380 returned.
 1381 -------------------------------------------------------------------------------
 1382 */
 1383 int32 float32_to_int32_round_to_zero( float32 a )
 1384 {
 1385     flag aSign;
 1386     int16 aExp, shiftCount;
 1387     bits32 aSig;
 1388     int32 z;
 1389 
 1390     aSig = extractFloat32Frac( a );
 1391     aExp = extractFloat32Exp( a );
 1392     aSign = extractFloat32Sign( a );
 1393     shiftCount = aExp - 0x9E;
 1394     if ( 0 <= shiftCount ) {
 1395         if ( a != 0xCF000000 ) {
 1396             float_raise( float_flag_invalid );
 1397             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 1398         }
 1399         return (sbits32) 0x80000000;
 1400     }
 1401     else if ( aExp <= 0x7E ) {
 1402         if ( aExp | aSig ) float_set_inexact();
 1403         return 0;
 1404     }
 1405     aSig = ( aSig | 0x00800000 )<<8;
 1406     z = aSig>>( - shiftCount );
 1407     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 1408         float_set_inexact();
 1409     }
 1410     if ( aSign ) z = - z;
 1411     return z;
 1412 
 1413 }
 1414 
 1415 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
 1416 /*
 1417 -------------------------------------------------------------------------------
 1418 Returns the result of converting the single-precision floating-point value
 1419 `a' to the 64-bit two's complement integer format.  The conversion is
 1420 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1421 Arithmetic---which means in particular that the conversion is rounded
 1422 according to the current rounding mode.  If `a' is a NaN, the largest
 1423 positive integer is returned.  Otherwise, if the conversion overflows, the
 1424 largest integer with the same sign as `a' is returned.
 1425 -------------------------------------------------------------------------------
 1426 */
 1427 int64 float32_to_int64( float32 a )
 1428 {
 1429     flag aSign;
 1430     int16 aExp, shiftCount;
 1431     bits32 aSig;
 1432     bits64 aSig64, aSigExtra;
 1433 
 1434     aSig = extractFloat32Frac( a );
 1435     aExp = extractFloat32Exp( a );
 1436     aSign = extractFloat32Sign( a );
 1437     shiftCount = 0xBE - aExp;
 1438     if ( shiftCount < 0 ) {
 1439         float_raise( float_flag_invalid );
 1440         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
 1441             return LIT64( 0x7FFFFFFFFFFFFFFF );
 1442         }
 1443         return (sbits64) LIT64( 0x8000000000000000 );
 1444     }
 1445     if ( aExp ) aSig |= 0x00800000;
 1446     aSig64 = aSig;
 1447     aSig64 <<= 40;
 1448     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
 1449     return roundAndPackInt64( aSign, aSig64, aSigExtra );
 1450 
 1451 }
 1452 
 1453 /*
 1454 -------------------------------------------------------------------------------
 1455 Returns the result of converting the single-precision floating-point value
 1456 `a' to the 64-bit two's complement integer format.  The conversion is
 1457 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1458 Arithmetic, except that the conversion is always rounded toward zero.  If
 1459 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 1460 conversion overflows, the largest integer with the same sign as `a' is
 1461 returned.
 1462 -------------------------------------------------------------------------------
 1463 */
 1464 int64 float32_to_int64_round_to_zero( float32 a )
 1465 {
 1466     flag aSign;
 1467     int16 aExp, shiftCount;
 1468     bits32 aSig;
 1469     bits64 aSig64;
 1470     int64 z;
 1471 
 1472     aSig = extractFloat32Frac( a );
 1473     aExp = extractFloat32Exp( a );
 1474     aSign = extractFloat32Sign( a );
 1475     shiftCount = aExp - 0xBE;
 1476     if ( 0 <= shiftCount ) {
 1477         if ( a != 0xDF000000 ) {
 1478             float_raise( float_flag_invalid );
 1479             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
 1480                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 1481             }
 1482         }
 1483         return (sbits64) LIT64( 0x8000000000000000 );
 1484     }
 1485     else if ( aExp <= 0x7E ) {
 1486         if ( aExp | aSig ) float_set_inexact();
 1487         return 0;
 1488     }
 1489     aSig64 = aSig | 0x00800000;
 1490     aSig64 <<= 40;
 1491     z = aSig64>>( - shiftCount );
 1492     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
 1493         float_set_inexact();
 1494     }
 1495     if ( aSign ) z = - z;
 1496     return z;
 1497 
 1498 }
 1499 #endif /* !SOFTFLOAT_FOR_GCC */
 1500 
 1501 /*
 1502 -------------------------------------------------------------------------------
 1503 Returns the result of converting the single-precision floating-point value
 1504 `a' to the double-precision floating-point format.  The conversion is
 1505 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1506 Arithmetic.
 1507 -------------------------------------------------------------------------------
 1508 */
 1509 float64 float32_to_float64( float32 a )
 1510 {
 1511     flag aSign;
 1512     int16 aExp;
 1513     bits32 aSig;
 1514 
 1515     aSig = extractFloat32Frac( a );
 1516     aExp = extractFloat32Exp( a );
 1517     aSign = extractFloat32Sign( a );
 1518     if ( aExp == 0xFF ) {
 1519         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 1520         return packFloat64( aSign, 0x7FF, 0 );
 1521     }
 1522     if ( aExp == 0 ) {
 1523         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1525         --aExp;
 1526     }
 1527     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 1528 
 1529 }
 1530 
 1531 #ifdef FLOATX80
 1532 
 1533 /*
 1534 -------------------------------------------------------------------------------
 1535 Returns the result of converting the single-precision floating-point value
 1536 `a' to the extended double-precision floating-point format.  The conversion
 1537 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1538 Arithmetic.
 1539 -------------------------------------------------------------------------------
 1540 */
 1541 floatx80 float32_to_floatx80( float32 a )
 1542 {
 1543     flag aSign;
 1544     int16 aExp;
 1545     bits32 aSig;
 1546 
 1547     aSig = extractFloat32Frac( a );
 1548     aExp = extractFloat32Exp( a );
 1549     aSign = extractFloat32Sign( a );
 1550     if ( aExp == 0xFF ) {
 1551         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 1552         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 1553     }
 1554     if ( aExp == 0 ) {
 1555         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 1556         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1557     }
 1558     aSig |= 0x00800000;
 1559     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 1560 
 1561 }
 1562 
 1563 #endif
 1564 
 1565 #ifdef FLOAT128
 1566 
 1567 /*
 1568 -------------------------------------------------------------------------------
 1569 Returns the result of converting the single-precision floating-point value
 1570 `a' to the double-precision floating-point format.  The conversion is
 1571 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1572 Arithmetic.
 1573 -------------------------------------------------------------------------------
 1574 */
 1575 float128 float32_to_float128( float32 a )
 1576 {
 1577     flag aSign;
 1578     int16 aExp;
 1579     bits32 aSig;
 1580 
 1581     aSig = extractFloat32Frac( a );
 1582     aExp = extractFloat32Exp( a );
 1583     aSign = extractFloat32Sign( a );
 1584     if ( aExp == 0xFF ) {
 1585         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
 1586         return packFloat128( aSign, 0x7FFF, 0, 0 );
 1587     }
 1588     if ( aExp == 0 ) {
 1589         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 1590         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1591         --aExp;
 1592     }
 1593     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
 1594 
 1595 }
 1596 
 1597 #endif
 1598 
 1599 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1600 /*
 1601 -------------------------------------------------------------------------------
 1602 Rounds the single-precision floating-point value `a' to an integer, and
 1603 returns the result as a single-precision floating-point value.  The
 1604 operation is performed according to the IEC/IEEE Standard for Binary
 1605 Floating-Point Arithmetic.
 1606 -------------------------------------------------------------------------------
 1607 */
 1608 float32 float32_round_to_int( float32 a )
 1609 {
 1610     flag aSign;
 1611     int16 aExp;
 1612     bits32 lastBitMask, roundBitsMask;
 1613     int8 roundingMode;
 1614     float32 z;
 1615 
 1616     aExp = extractFloat32Exp( a );
 1617     if ( 0x96 <= aExp ) {
 1618         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 1619             return propagateFloat32NaN( a, a );
 1620         }
 1621         return a;
 1622     }
 1623     if ( aExp <= 0x7E ) {
 1624         if ( (bits32) ( a<<1 ) == 0 ) return a;
 1625         float_set_inexact();
 1626         aSign = extractFloat32Sign( a );
 1627         switch ( float_rounding_mode() ) {
 1628          case float_round_nearest_even:
 1629             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 1630                 return packFloat32( aSign, 0x7F, 0 );
 1631             }
 1632             break;
 1633          case float_round_down:
 1634             return aSign ? 0xBF800000 : 0;
 1635          case float_round_up:
 1636             return aSign ? 0x80000000 : 0x3F800000;
 1637         }
 1638         return packFloat32( aSign, 0, 0 );
 1639     }
 1640     lastBitMask = 1;
 1641     lastBitMask <<= 0x96 - aExp;
 1642     roundBitsMask = lastBitMask - 1;
 1643     z = a;
 1644     roundingMode = float_rounding_mode();
 1645     if ( roundingMode == float_round_nearest_even ) {
 1646         z += lastBitMask>>1;
 1647         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
 1648     }
 1649     else if ( roundingMode != float_round_to_zero ) {
 1650         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 1651             z += roundBitsMask;
 1652         }
 1653     }
 1654     z &= ~ roundBitsMask;
 1655     if ( z != a ) float_set_inexact();
 1656     return z;
 1657 
 1658 }
 1659 #endif /* !SOFTFLOAT_FOR_GCC */
 1660 
 1661 /*
 1662 -------------------------------------------------------------------------------
 1663 Returns the result of adding the absolute values of the single-precision
 1664 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 1665 before being returned.  `zSign' is ignored if the result is a NaN.
 1666 The addition is performed according to the IEC/IEEE Standard for Binary
 1667 Floating-Point Arithmetic.
 1668 -------------------------------------------------------------------------------
 1669 */
 1670 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
 1671 {
 1672     int16 aExp, bExp, zExp;
 1673     bits32 aSig, bSig, zSig;
 1674     int16 expDiff;
 1675 
 1676     aSig = extractFloat32Frac( a );
 1677     aExp = extractFloat32Exp( a );
 1678     bSig = extractFloat32Frac( b );
 1679     bExp = extractFloat32Exp( b );
 1680     expDiff = aExp - bExp;
 1681     aSig <<= 6;
 1682     bSig <<= 6;
 1683     if ( 0 < expDiff ) {
 1684         if ( aExp == 0xFF ) {
 1685             if ( aSig ) return propagateFloat32NaN( a, b );
 1686             return a;
 1687         }
 1688         if ( bExp == 0 ) {
 1689             --expDiff;
 1690         }
 1691         else {
 1692             bSig |= 0x20000000;
 1693         }
 1694         shift32RightJamming( bSig, expDiff, &bSig );
 1695         zExp = aExp;
 1696     }
 1697     else if ( expDiff < 0 ) {
 1698         if ( bExp == 0xFF ) {
 1699             if ( bSig ) return propagateFloat32NaN( a, b );
 1700             return packFloat32( zSign, 0xFF, 0 );
 1701         }
 1702         if ( aExp == 0 ) {
 1703             ++expDiff;
 1704         }
 1705         else {
 1706             aSig |= 0x20000000;
 1707         }
 1708         shift32RightJamming( aSig, - expDiff, &aSig );
 1709         zExp = bExp;
 1710     }
 1711     else {
 1712         if ( aExp == 0xFF ) {
 1713             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
 1714             return a;
 1715         }
 1716         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
 1717         zSig = 0x40000000 + aSig + bSig;
 1718         zExp = aExp;
 1719         goto roundAndPack;
 1720     }
 1721     aSig |= 0x20000000;
 1722     zSig = ( aSig + bSig )<<1;
 1723     --zExp;
 1724     if ( (sbits32) zSig < 0 ) {
 1725         zSig = aSig + bSig;
 1726         ++zExp;
 1727     }
 1728  roundAndPack:
 1729     return roundAndPackFloat32( zSign, zExp, zSig );
 1730 
 1731 }
 1732 
 1733 /*
 1734 -------------------------------------------------------------------------------
 1735 Returns the result of subtracting the absolute values of the single-
 1736 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 1737 difference is negated before being returned.  `zSign' is ignored if the
 1738 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 1739 Standard for Binary Floating-Point Arithmetic.
 1740 -------------------------------------------------------------------------------
 1741 */
 1742 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
 1743 {
 1744     int16 aExp, bExp, zExp;
 1745     bits32 aSig, bSig, zSig;
 1746     int16 expDiff;
 1747 
 1748     aSig = extractFloat32Frac( a );
 1749     aExp = extractFloat32Exp( a );
 1750     bSig = extractFloat32Frac( b );
 1751     bExp = extractFloat32Exp( b );
 1752     expDiff = aExp - bExp;
 1753     aSig <<= 7;
 1754     bSig <<= 7;
 1755     if ( 0 < expDiff ) goto aExpBigger;
 1756     if ( expDiff < 0 ) goto bExpBigger;
 1757     if ( aExp == 0xFF ) {
 1758         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
 1759         float_raise( float_flag_invalid );
 1760         return float32_default_nan;
 1761     }
 1762     if ( aExp == 0 ) {
 1763         aExp = 1;
 1764         bExp = 1;
 1765     }
 1766     if ( bSig < aSig ) goto aBigger;
 1767     if ( aSig < bSig ) goto bBigger;
 1768     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
 1769  bExpBigger:
 1770     if ( bExp == 0xFF ) {
 1771         if ( bSig ) return propagateFloat32NaN( a, b );
 1772         return packFloat32( zSign ^ 1, 0xFF, 0 );
 1773     }
 1774     if ( aExp == 0 ) {
 1775         ++expDiff;
 1776     }
 1777     else {
 1778         aSig |= 0x40000000;
 1779     }
 1780     shift32RightJamming( aSig, - expDiff, &aSig );
 1781     bSig |= 0x40000000;
 1782  bBigger:
 1783     zSig = bSig - aSig;
 1784     zExp = bExp;
 1785     zSign ^= 1;
 1786     goto normalizeRoundAndPack;
 1787  aExpBigger:
 1788     if ( aExp == 0xFF ) {
 1789         if ( aSig ) return propagateFloat32NaN( a, b );
 1790         return a;
 1791     }
 1792     if ( bExp == 0 ) {
 1793         --expDiff;
 1794     }
 1795     else {
 1796         bSig |= 0x40000000;
 1797     }
 1798     shift32RightJamming( bSig, expDiff, &bSig );
 1799     aSig |= 0x40000000;
 1800  aBigger:
 1801     zSig = aSig - bSig;
 1802     zExp = aExp;
 1803  normalizeRoundAndPack:
 1804     --zExp;
 1805     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
 1806 
 1807 }
 1808 
 1809 /*
 1810 -------------------------------------------------------------------------------
 1811 Returns the result of adding the single-precision floating-point values `a'
 1812 and `b'.  The operation is performed according to the IEC/IEEE Standard for
 1813 Binary Floating-Point Arithmetic.
 1814 -------------------------------------------------------------------------------
 1815 */
 1816 float32 float32_add( float32 a, float32 b )
 1817 {
 1818     flag aSign, bSign;
 1819 
 1820     aSign = extractFloat32Sign( a );
 1821     bSign = extractFloat32Sign( b );
 1822     if ( aSign == bSign ) {
 1823         return addFloat32Sigs( a, b, aSign );
 1824     }
 1825     else {
 1826         return subFloat32Sigs( a, b, aSign );
 1827     }
 1828 
 1829 }
 1830 
 1831 /*
 1832 -------------------------------------------------------------------------------
 1833 Returns the result of subtracting the single-precision floating-point values
 1834 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 1835 for Binary Floating-Point Arithmetic.
 1836 -------------------------------------------------------------------------------
 1837 */
 1838 float32 float32_sub( float32 a, float32 b )
 1839 {
 1840     flag aSign, bSign;
 1841 
 1842     aSign = extractFloat32Sign( a );
 1843     bSign = extractFloat32Sign( b );
 1844     if ( aSign == bSign ) {
 1845         return subFloat32Sigs( a, b, aSign );
 1846     }
 1847     else {
 1848         return addFloat32Sigs( a, b, aSign );
 1849     }
 1850 
 1851 }
 1852 
 1853 /*
 1854 -------------------------------------------------------------------------------
 1855 Returns the result of multiplying the single-precision floating-point values
 1856 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 1857 for Binary Floating-Point Arithmetic.
 1858 -------------------------------------------------------------------------------
 1859 */
 1860 float32 float32_mul( float32 a, float32 b )
 1861 {
 1862     flag aSign, bSign, zSign;
 1863     int16 aExp, bExp, zExp;
 1864     bits32 aSig, bSig;
 1865     bits64 zSig64;
 1866     bits32 zSig;
 1867 
 1868     aSig = extractFloat32Frac( a );
 1869     aExp = extractFloat32Exp( a );
 1870     aSign = extractFloat32Sign( a );
 1871     bSig = extractFloat32Frac( b );
 1872     bExp = extractFloat32Exp( b );
 1873     bSign = extractFloat32Sign( b );
 1874     zSign = aSign ^ bSign;
 1875     if ( aExp == 0xFF ) {
 1876         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
 1877             return propagateFloat32NaN( a, b );
 1878         }
 1879         if ( ( bExp | bSig ) == 0 ) {
 1880             float_raise( float_flag_invalid );
 1881             return float32_default_nan;
 1882         }
 1883         return packFloat32( zSign, 0xFF, 0 );
 1884     }
 1885     if ( bExp == 0xFF ) {
 1886         if ( bSig ) return propagateFloat32NaN( a, b );
 1887         if ( ( aExp | aSig ) == 0 ) {
 1888             float_raise( float_flag_invalid );
 1889             return float32_default_nan;
 1890         }
 1891         return packFloat32( zSign, 0xFF, 0 );
 1892     }
 1893     if ( aExp == 0 ) {
 1894         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
 1895         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1896     }
 1897     if ( bExp == 0 ) {
 1898         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
 1899         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 1900     }
 1901     zExp = aExp + bExp - 0x7F;
 1902     aSig = ( aSig | 0x00800000 )<<7;
 1903     bSig = ( bSig | 0x00800000 )<<8;
 1904     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
 1905     zSig = zSig64;
 1906     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
 1907         zSig <<= 1;
 1908         --zExp;
 1909     }
 1910     return roundAndPackFloat32( zSign, zExp, zSig );
 1911 
 1912 }
 1913 
 1914 /*
 1915 -------------------------------------------------------------------------------
 1916 Returns the result of dividing the single-precision floating-point value `a'
 1917 by the corresponding value `b'.  The operation is performed according to the
 1918 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1919 -------------------------------------------------------------------------------
 1920 */
 1921 float32 float32_div( float32 a, float32 b )
 1922 {
 1923     flag aSign, bSign, zSign;
 1924     int16 aExp, bExp, zExp;
 1925     bits32 aSig, bSig, zSig;
 1926 
 1927     aSig = extractFloat32Frac( a );
 1928     aExp = extractFloat32Exp( a );
 1929     aSign = extractFloat32Sign( a );
 1930     bSig = extractFloat32Frac( b );
 1931     bExp = extractFloat32Exp( b );
 1932     bSign = extractFloat32Sign( b );
 1933     zSign = aSign ^ bSign;
 1934     if ( aExp == 0xFF ) {
 1935         if ( aSig ) return propagateFloat32NaN( a, b );
 1936         if ( bExp == 0xFF ) {
 1937             if ( bSig ) return propagateFloat32NaN( a, b );
 1938             float_raise( float_flag_invalid );
 1939             return float32_default_nan;
 1940         }
 1941         return packFloat32( zSign, 0xFF, 0 );
 1942     }
 1943     if ( bExp == 0xFF ) {
 1944         if ( bSig ) return propagateFloat32NaN( a, b );
 1945         return packFloat32( zSign, 0, 0 );
 1946     }
 1947     if ( bExp == 0 ) {
 1948         if ( bSig == 0 ) {
 1949             if ( ( aExp | aSig ) == 0 ) {
 1950                 float_raise( float_flag_invalid );
 1951                 return float32_default_nan;
 1952             }
 1953             float_raise( float_flag_divbyzero );
 1954             return packFloat32( zSign, 0xFF, 0 );
 1955         }
 1956         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 1957     }
 1958     if ( aExp == 0 ) {
 1959         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
 1960         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1961     }
 1962     zExp = aExp - bExp + 0x7D;
 1963     aSig = ( aSig | 0x00800000 )<<7;
 1964     bSig = ( bSig | 0x00800000 )<<8;
 1965     if ( bSig <= ( aSig + aSig ) ) {
 1966         aSig >>= 1;
 1967         ++zExp;
 1968     }
 1969     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
 1970     if ( ( zSig & 0x3F ) == 0 ) {
 1971         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
 1972     }
 1973     return roundAndPackFloat32( zSign, zExp, zSig );
 1974 
 1975 }
 1976 
 1977 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1978 /*
 1979 -------------------------------------------------------------------------------
 1980 Returns the remainder of the single-precision floating-point value `a'
 1981 with respect to the corresponding value `b'.  The operation is performed
 1982 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

      Special cases, in the order they are tested:
        - `a' is a NaN, or `a' is infinite and `b' is a NaN: the result of
          propagateFloat32NaN( a, b ) is returned;
        - `a' is infinite otherwise: invalid exception, default NaN;
        - `b' is a NaN: propagateFloat32NaN( a, b );
        - `b' is infinite: `a' is returned unchanged;
        - `b' is zero: invalid exception, default NaN;
        - `a' is zero: `a' is returned unchanged;
        - after normalization, exponent of `a' more than one below that of
          `b': `a' is returned unchanged.
      Otherwise the result is built by normalizeRoundAndPackFloat32 from the
      reduced significand, the exponent of `b', and the sign of `a' (flipped
      when the chosen remainder is negative).
 1983 -------------------------------------------------------------------------------
 1984 */
 1985 float32 float32_rem( float32 a, float32 b )
 1986 {
 1987     flag aSign, bSign, zSign;
 1988     int16 aExp, bExp, expDiff;
 1989     bits32 aSig, bSig;
 1990     bits32 q;
 1991     bits64 aSig64, bSig64, q64;
 1992     bits32 alternateASig;
 1993     sbits32 sigMean;
 1994 
 1995     aSig = extractFloat32Frac( a );
 1996     aExp = extractFloat32Exp( a );
 1997     aSign = extractFloat32Sign( a );
 1998     bSig = extractFloat32Frac( b );
 1999     bExp = extractFloat32Exp( b );
          /* bSign is extracted here but never read again in this function. */
 2000     bSign = extractFloat32Sign( b );
          /* `a' is an infinity or a NaN. */
 2001     if ( aExp == 0xFF ) {
 2002         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
 2003             return propagateFloat32NaN( a, b );
 2004         }
 2005         float_raise( float_flag_invalid );
 2006         return float32_default_nan;
 2007     }
          /* `b' is an infinity or a NaN, `a' is finite. */
 2008     if ( bExp == 0xFF ) {
 2009         if ( bSig ) return propagateFloat32NaN( a, b );
 2010         return a;
 2011     }
 2012     if ( bExp == 0 ) {
 2013         if ( bSig == 0 ) {
                  /* Remainder with respect to zero is invalid. */
 2014             float_raise( float_flag_invalid );
 2015             return float32_default_nan;
 2016         }
 2017         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 2018     }
 2019     if ( aExp == 0 ) {
 2020         if ( aSig == 0 ) return a;
 2021         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 2022     }
 2023     expDiff = aExp - bExp;
          /* Insert the implicit leading bits. */
 2024     aSig |= 0x00800000;
 2025     bSig |= 0x00800000;
          /* Exponent gap below 32: one 64-by-32-bit division is used. */
 2026     if ( expDiff < 32 ) {
 2027         aSig <<= 8;
 2028         bSig <<= 8;
 2029         if ( expDiff < 0 ) {
                  /* Exponent of `a' at least two below that of `b'. */
 2030             if ( expDiff < -1 ) return a;
 2031             aSig >>= 1;
 2032         }
              /* First quotient bit, obtained by comparison. */
 2033         q = ( bSig <= aSig );
 2034         if ( q ) aSig -= bSig;
 2035         if ( 0 < expDiff ) {
 2036             q = ( ( (bits64) aSig )<<32 ) / bSig;
 2037             q >>= 32 - expDiff;
 2038             bSig >>= 2;
                  /* Unsigned arithmetic; wrap-around is relied upon below. */
 2039             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
 2040         }
 2041         else {
 2042             aSig >>= 2;
 2043             bSig >>= 2;
 2044         }
 2045     }
          /* Exponent gap of 32 or more: reduce in steps with estimateDiv128To64. */
 2046     else {
 2047         if ( bSig <= aSig ) aSig -= bSig;
 2048         aSig64 = ( (bits64) aSig )<<40;
 2049         bSig64 = ( (bits64) bSig )<<40;
 2050         expDiff -= 64;
              /* Each pass of this loop consumes 62 of the exponent difference. */
 2051         while ( 0 < expDiff ) {
 2052             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
                  /* The estimate is lowered by 2 and clamped at 0; presumably so
                     it never exceeds the true quotient -- confirm against the
                     contract of estimateDiv128To64. */
 2053             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
 2054             aSig64 = - ( ( bSig * q64 )<<38 );
 2055             expDiff -= 62;
 2056         }
 2057         expDiff += 64;
 2058         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
 2059         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
 2060         q = q64>>( 64 - expDiff );
 2061         bSig <<= 6;
 2062         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
 2063     }
          /* Subtract bSig until the remainder becomes negative; alternateASig
             keeps the last value seen before the final subtraction and q counts
             the subtractions. */
 2064     do {
 2065         alternateASig = aSig;
 2066         ++q;
 2067         aSig -= bSig;
 2068     } while ( 0 <= (sbits32) aSig );
          /* Pick between the negative remainder (aSig) and the previous one
             (alternateASig): the previous one is used when the sum of the two is
             negative, or when the sum is zero and q is odd. */
 2069     sigMean = aSig + alternateASig;
 2070     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
 2071         aSig = alternateASig;
 2072     }
          /* A negative remainder flips the result sign; keep its magnitude. */
 2073     zSign = ( (sbits32) aSig < 0 );
 2074     if ( zSign ) aSig = - aSig;
 2075     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
 2076 
 2077 }
 2078 #endif /* !SOFTFLOAT_FOR_GCC */
 2079 
 2080 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2081 /*
 2082 -------------------------------------------------------------------------------
 2083 Returns the square root of the single-precision floating-point value `a'.
 2084 The operation is performed according to the IEC/IEEE Standard for Binary
 2085 Floating-Point Arithmetic.
 2086 -------------------------------------------------------------------------------
 2087 */
 2088 float32 float32_sqrt( float32 a )
 2089 {
 2090     flag aSign;
 2091     int16 aExp, zExp;
 2092     bits32 aSig, zSig;
 2093     bits64 rem, term;
 2094 
 2095     aSig = extractFloat32Frac( a );
 2096     aExp = extractFloat32Exp( a );
 2097     aSign = extractFloat32Sign( a );
 2098     if ( aExp == 0xFF ) {
 2099         if ( aSig ) return propagateFloat32NaN( a, 0 );
 2100         if ( ! aSign ) return a;
 2101         float_raise( float_flag_invalid );
 2102         return float32_default_nan;
 2103     }
 2104     if ( aSign ) {
 2105         if ( ( aExp | aSig ) == 0 ) return a;
 2106         float_raise( float_flag_invalid );
 2107         return float32_default_nan;
 2108     }
 2109     if ( aExp == 0 ) {
 2110         if ( aSig == 0 ) return 0;
 2111         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 2112     }
 2113     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
 2114     aSig = ( aSig | 0x00800000 )<<8;
 2115     zSig = estimateSqrt32( aExp, aSig ) + 2;
 2116     if ( ( zSig & 0x7F ) <= 5 ) {
 2117         if ( zSig < 2 ) {
 2118             zSig = 0x7FFFFFFF;
 2119             goto roundAndPack;
 2120         }
 2121         aSig >>= aExp & 1;
 2122         term = ( (bits64) zSig ) * zSig;
 2123         rem = ( ( (bits64) aSig )<<32 ) - term;
 2124         while ( (sbits64) rem < 0 ) {
 2125             --zSig;
 2126             rem += ( ( (bits64) zSig )<<1 ) | 1;
 2127         }
 2128         zSig |= ( rem != 0 );
 2129     }
 2130     shift32RightJamming( zSig, 1, &zSig );
 2131  roundAndPack:
 2132     return roundAndPackFloat32( 0, zExp, zSig );
 2133 
 2134 }
 2135 #endif /* !SOFTFLOAT_FOR_GCC */
 2136 
 2137 /*
 2138 -------------------------------------------------------------------------------
 2139 Returns 1 if the single-precision floating-point value `a' is equal to
 2140 the corresponding value `b', and 0 otherwise.  The comparison is performed
 2141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2142 -------------------------------------------------------------------------------
 2143 */
 2144 flag float32_eq( float32 a, float32 b )
 2145 {
 2146 
 2147     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2148          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2149        ) {
 2150         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2151             float_raise( float_flag_invalid );
 2152         }
 2153         return 0;
 2154     }
 2155     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2156 
 2157 }
 2158 
 2159 /*
 2160 -------------------------------------------------------------------------------
 2161 Returns 1 if the single-precision floating-point value `a' is less than
 2162 or equal to the corresponding value `b', and 0 otherwise.  The comparison
 2163 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 2164 Arithmetic.
 2165 -------------------------------------------------------------------------------
 2166 */
 2167 flag float32_le( float32 a, float32 b )
 2168 {
 2169     flag aSign, bSign;
 2170 
 2171     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2172          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2173        ) {
 2174         float_raise( float_flag_invalid );
 2175         return 0;
 2176     }
 2177     aSign = extractFloat32Sign( a );
 2178     bSign = extractFloat32Sign( b );
 2179     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2180     return ( a == b ) || ( aSign ^ ( a < b ) );
 2181 
 2182 }
 2183 
 2184 /*
 2185 -------------------------------------------------------------------------------
 2186 Returns 1 if the single-precision floating-point value `a' is less than
 2187 the corresponding value `b', and 0 otherwise.  The comparison is performed
 2188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2189 -------------------------------------------------------------------------------
 2190 */
 2191 flag float32_lt( float32 a, float32 b )
 2192 {
 2193     flag aSign, bSign;
 2194 
 2195     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2196          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2197        ) {
 2198         float_raise( float_flag_invalid );
 2199         return 0;
 2200     }
 2201     aSign = extractFloat32Sign( a );
 2202     bSign = extractFloat32Sign( b );
 2203     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
 2204     return ( a != b ) && ( aSign ^ ( a < b ) );
 2205 
 2206 }
 2207 
 2208 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2209 /*
 2210 -------------------------------------------------------------------------------
 2211 Returns 1 if the single-precision floating-point value `a' is equal to
 2212 the corresponding value `b', and 0 otherwise.  The invalid exception is
 2213 raised if either operand is a NaN.  Otherwise, the comparison is performed
 2214 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2215 -------------------------------------------------------------------------------
 2216 */
 2217 flag float32_eq_signaling( float32 a, float32 b )
 2218 {
 2219 
 2220     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2221          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2222        ) {
 2223         float_raise( float_flag_invalid );
 2224         return 0;
 2225     }
 2226     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2227 
 2228 }
 2229 
 2230 /*
 2231 -------------------------------------------------------------------------------
 2232 Returns 1 if the single-precision floating-point value `a' is less than or
 2233 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 2234 cause an exception.  Otherwise, the comparison is performed according to the
 2235 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2236 -------------------------------------------------------------------------------
 2237 */
 2238 flag float32_le_quiet( float32 a, float32 b )
 2239 {
 2240     flag aSign, bSign;
 2241 
 2242     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2243          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2244        ) {
 2245         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2246             float_raise( float_flag_invalid );
 2247         }
 2248         return 0;
 2249     }
 2250     aSign = extractFloat32Sign( a );
 2251     bSign = extractFloat32Sign( b );
 2252     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2253     return ( a == b ) || ( aSign ^ ( a < b ) );
 2254 
 2255 }
 2256 
 2257 /*
 2258 -------------------------------------------------------------------------------
 2259 Returns 1 if the single-precision floating-point value `a' is less than
 2260 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 2261 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 2262 Standard for Binary Floating-Point Arithmetic.
 2263 -------------------------------------------------------------------------------
 2264 */
 2265 flag float32_lt_quiet( float32 a, float32 b )
 2266 {
 2267     flag aSign, bSign;
 2268 
 2269     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2270          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2271        ) {
 2272         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2273             float_raise( float_flag_invalid );
 2274         }
 2275         return 0;
 2276     }
 2277     aSign = extractFloat32Sign( a );
 2278     bSign = extractFloat32Sign( b );
 2279     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
 2280     return ( a != b ) && ( aSign ^ ( a < b ) );
 2281 
 2282 }
 2283 #endif /* !SOFTFLOAT_FOR_GCC */
 2284 
 2285 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2286 /*
 2287 -------------------------------------------------------------------------------
 2288 Returns the result of converting the double-precision floating-point value
 2289 `a' to the 32-bit two's complement integer format.  The conversion is
 2290 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2291 Arithmetic---which means in particular that the conversion is rounded
 2292 according to the current rounding mode.  If `a' is a NaN, the largest
 2293 positive integer is returned.  Otherwise, if the conversion overflows, the
 2294 largest integer with the same sign as `a' is returned.
 2295 -------------------------------------------------------------------------------
 2296 */
 2297 int32 float64_to_int32( float64 a )
 2298 {
 2299     flag aSign;
 2300     int16 aExp, shiftCount;
 2301     bits64 aSig;
 2302 
 2303     aSig = extractFloat64Frac( a );
 2304     aExp = extractFloat64Exp( a );
 2305     aSign = extractFloat64Sign( a );
 2306     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 2307     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2308     shiftCount = 0x42C - aExp;
 2309     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
 2310     return roundAndPackInt32( aSign, aSig );
 2311 
 2312 }
 2313 #endif /* !SOFTFLOAT_FOR_GCC */
 2314 
 2315 /*
 2316 -------------------------------------------------------------------------------
 2317 Returns the result of converting the double-precision floating-point value
 2318 `a' to the 32-bit two's complement integer format.  The conversion is
 2319 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2320 Arithmetic, except that the conversion is always rounded toward zero.
 2321 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 2322 the conversion overflows, the largest integer with the same sign as `a' is
 2323 returned.
 2324 -------------------------------------------------------------------------------
 2325 */
 2326 int32 float64_to_int32_round_to_zero( float64 a )
 2327 {
 2328     flag aSign;
 2329     int16 aExp, shiftCount;
 2330     bits64 aSig, savedASig;
 2331     int32 z;
 2332 
 2333     aSig = extractFloat64Frac( a );
 2334     aExp = extractFloat64Exp( a );
 2335     aSign = extractFloat64Sign( a );
 2336     if ( 0x41E < aExp ) {
 2337         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 2338         goto invalid;
 2339     }
 2340     else if ( aExp < 0x3FF ) {
 2341         if ( aExp || aSig ) float_set_inexact();
 2342         return 0;
 2343     }
 2344     aSig |= LIT64( 0x0010000000000000 );
 2345     shiftCount = 0x433 - aExp;
 2346     savedASig = aSig;
 2347     aSig >>= shiftCount;
 2348     z = aSig;
 2349     if ( aSign ) z = - z;
 2350     if ( ( z < 0 ) ^ aSign ) {
 2351  invalid:
 2352         float_raise( float_flag_invalid );
 2353         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 2354     }
 2355     if ( ( aSig<<shiftCount ) != savedASig ) {
 2356         float_set_inexact();
 2357     }
 2358     return z;
 2359 
 2360 }
 2361 
 2362 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2363 /*
 2364 -------------------------------------------------------------------------------
 2365 Returns the result of converting the double-precision floating-point value
 2366 `a' to the 64-bit two's complement integer format.  The conversion is
 2367 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2368 Arithmetic---which means in particular that the conversion is rounded
 2369 according to the current rounding mode.  If `a' is a NaN, the largest
 2370 positive integer is returned.  Otherwise, if the conversion overflows, the
 2371 largest integer with the same sign as `a' is returned.
 2372 -------------------------------------------------------------------------------
 2373 */
 2374 int64 float64_to_int64( float64 a )
 2375 {
 2376     flag aSign;
 2377     int16 aExp, shiftCount;
 2378     bits64 aSig, aSigExtra;
 2379 
 2380     aSig = extractFloat64Frac( a );
 2381     aExp = extractFloat64Exp( a );
 2382     aSign = extractFloat64Sign( a );
 2383     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2384     shiftCount = 0x433 - aExp;
 2385     if ( shiftCount <= 0 ) {
 2386         if ( 0x43E < aExp ) {
 2387             float_raise( float_flag_invalid );
 2388             if (    ! aSign
 2389                  || (    ( aExp == 0x7FF )
 2390                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
 2391                ) {
 2392                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 2393             }
 2394             return (sbits64) LIT64( 0x8000000000000000 );
 2395         }
 2396         aSigExtra = 0;
 2397         aSig <<= - shiftCount;
 2398     }
 2399     else {
 2400         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
 2401     }
 2402     return roundAndPackInt64( aSign, aSig, aSigExtra );
 2403 
 2404 }
 2405 
 2406 /*
 2407 -------------------------------------------------------------------------------
 2408 Returns the result of converting the double-precision floating-point value
 2409 `a' to the 64-bit two's complement integer format.  The conversion is
 2410 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2411 Arithmetic, except that the conversion is always rounded toward zero.
 2412 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 2413 the conversion overflows, the largest integer with the same sign as `a' is
 2414 returned.
 2415 -------------------------------------------------------------------------------
 2416 */
 2417 int64 float64_to_int64_round_to_zero( float64 a )
 2418 {
 2419     flag aSign;
 2420     int16 aExp, shiftCount;
 2421     bits64 aSig;
 2422     int64 z;
 2423 
 2424     aSig = extractFloat64Frac( a );
 2425     aExp = extractFloat64Exp( a );
 2426     aSign = extractFloat64Sign( a );
 2427     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2428     shiftCount = aExp - 0x433;
 2429     if ( 0 <= shiftCount ) {
 2430         if ( 0x43E <= aExp ) {
 2431             if ( a != LIT64( 0xC3E0000000000000 ) ) {
 2432                 float_raise( float_flag_invalid );
 2433                 if (    ! aSign
 2434                      || (    ( aExp == 0x7FF )
 2435                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
 2436                    ) {
 2437                     return LIT64( 0x7FFFFFFFFFFFFFFF );
 2438                 }
 2439             }
 2440             return (sbits64) LIT64( 0x8000000000000000 );
 2441         }
 2442         z = aSig<<shiftCount;
 2443     }
 2444     else {
 2445         if ( aExp < 0x3FE ) {
 2446             if ( aExp | aSig ) float_set_inexact();
 2447             return 0;
 2448         }
 2449         z = aSig>>( - shiftCount );
 2450         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
 2451             float_set_inexact();
 2452         }
 2453     }
 2454     if ( aSign ) z = - z;
 2455     return z;
 2456 
 2457 }
 2458 #endif /* !SOFTFLOAT_FOR_GCC */
 2459 
 2460 /*
 2461 -------------------------------------------------------------------------------
 2462 Returns the result of converting the double-precision floating-point value
 2463 `a' to the single-precision floating-point format.  The conversion is
 2464 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2465 Arithmetic.
 2466 -------------------------------------------------------------------------------
 2467 */
 2468 float32 float64_to_float32( float64 a )
 2469 {
 2470     flag aSign;
 2471     int16 aExp;
 2472     bits64 aSig;
 2473     bits32 zSig;
 2474 
 2475     aSig = extractFloat64Frac( a );
 2476     aExp = extractFloat64Exp( a );
 2477     aSign = extractFloat64Sign( a );
 2478     if ( aExp == 0x7FF ) {
 2479         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
 2480         return packFloat32( aSign, 0xFF, 0 );
 2481     }
 2482     shift64RightJamming( aSig, 22, &aSig );
 2483     zSig = aSig;
 2484     if ( aExp || zSig ) {
 2485         zSig |= 0x40000000;
 2486         aExp -= 0x381;
 2487     }
 2488     return roundAndPackFloat32( aSign, aExp, zSig );
 2489 
 2490 }
 2491 
 2492 #ifdef FLOATX80
 2493 
 2494 /*
 2495 -------------------------------------------------------------------------------
 2496 Returns the result of converting the double-precision floating-point value
 2497 `a' to the extended double-precision floating-point format.  The conversion
 2498 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 2499 Arithmetic.
 2500 -------------------------------------------------------------------------------
 2501 */
 2502 floatx80 float64_to_floatx80( float64 a )
 2503 {
 2504     flag aSign;
 2505     int16 aExp;
 2506     bits64 aSig;
 2507 
 2508     aSig = extractFloat64Frac( a );
 2509     aExp = extractFloat64Exp( a );
 2510     aSign = extractFloat64Sign( a );
 2511     if ( aExp == 0x7FF ) {
 2512         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
 2513         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 2514     }
 2515     if ( aExp == 0 ) {
 2516         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 2517         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2518     }
 2519     return
 2520         packFloatx80(
 2521             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
 2522 
 2523 }
 2524 
 2525 #endif
 2526 
 2527 #ifdef FLOAT128
 2528 
 2529 /*
 2530 -------------------------------------------------------------------------------
 2531 Returns the result of converting the double-precision floating-point value
 2532 `a' to the quadruple-precision floating-point format.  The conversion is
 2533 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2534 Arithmetic.
 2535 -------------------------------------------------------------------------------
 2536 */
 2537 float128 float64_to_float128( float64 a )
 2538 {
 2539     flag aSign;
 2540     int16 aExp;
 2541     bits64 aSig, zSig0, zSig1;
 2542 
 2543     aSig = extractFloat64Frac( a );
 2544     aExp = extractFloat64Exp( a );
 2545     aSign = extractFloat64Sign( a );
 2546     if ( aExp == 0x7FF ) {
 2547         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
 2548         return packFloat128( aSign, 0x7FFF, 0, 0 );
 2549     }
 2550     if ( aExp == 0 ) {
 2551         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 2552         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2553         --aExp;
 2554     }
 2555     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
 2556     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
 2557 
 2558 }
 2559 
 2560 #endif
 2561 
 2562 #ifndef SOFTFLOAT_FOR_GCC
 2563 /*
 2564 -------------------------------------------------------------------------------
 2565 Rounds the double-precision floating-point value `a' to an integer, and
 2566 returns the result as a double-precision floating-point value.  The
 2567 operation is performed according to the IEC/IEEE Standard for Binary
 2568 Floating-Point Arithmetic.
 2569 -------------------------------------------------------------------------------
 2570 */
 2571 float64 float64_round_to_int( float64 a )
 2572 {
 2573     flag aSign;
 2574     int16 aExp;
 2575     bits64 lastBitMask, roundBitsMask;
 2576     int8 roundingMode;
 2577     float64 z;
 2578 
 2579     aExp = extractFloat64Exp( a );
 2580     if ( 0x433 <= aExp ) {
 2581         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
 2582             return propagateFloat64NaN( a, a );
 2583         }
 2584         return a;
 2585     }
 2586     if ( aExp < 0x3FF ) {
 2587         if ( (bits64) ( a<<1 ) == 0 ) return a;
 2588         float_set_inexact();
 2589         aSign = extractFloat64Sign( a );
 2590         switch ( float_rounding_mode() ) {
 2591          case float_round_nearest_even:
 2592             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
 2593                 return packFloat64( aSign, 0x3FF, 0 );
 2594             }
 2595             break;
 2596          case float_round_down:
 2597             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
 2598          case float_round_up:
 2599             return
 2600             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
 2601         }
 2602         return packFloat64( aSign, 0, 0 );
 2603     }
 2604     lastBitMask = 1;
 2605     lastBitMask <<= 0x433 - aExp;
 2606     roundBitsMask = lastBitMask - 1;
 2607     z = a;
 2608     roundingMode = float_rounding_mode();
 2609     if ( roundingMode == float_round_nearest_even ) {
 2610         z += lastBitMask>>1;
 2611         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
 2612     }
 2613     else if ( roundingMode != float_round_to_zero ) {
 2614         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 2615             z += roundBitsMask;
 2616         }
 2617     }
 2618     z &= ~ roundBitsMask;
 2619     if ( z != a ) float_set_inexact();
 2620     return z;
 2621 
 2622 }
 2623 #endif
 2624 
 2625 /*
 2626 -------------------------------------------------------------------------------
 2627 Returns the result of adding the absolute values of the double-precision
 2628 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 2629 before being returned.  `zSign' is ignored if the result is a NaN.
 2630 The addition is performed according to the IEC/IEEE Standard for Binary
 2631 Floating-Point Arithmetic.
 2632 -------------------------------------------------------------------------------
 2633 */
 2634 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
 2635 {
 2636     int16 aExp, bExp, zExp;
 2637     bits64 aSig, bSig, zSig;
 2638     int16 expDiff;
 2639 
 2640     aSig = extractFloat64Frac( a );
 2641     aExp = extractFloat64Exp( a );
 2642     bSig = extractFloat64Frac( b );
 2643     bExp = extractFloat64Exp( b );
 2644     expDiff = aExp - bExp;
 2645     aSig <<= 9;
 2646     bSig <<= 9;
 2647     if ( 0 < expDiff ) {
 2648         if ( aExp == 0x7FF ) {
 2649             if ( aSig ) return propagateFloat64NaN( a, b );
 2650             return a;
 2651         }
 2652         if ( bExp == 0 ) {
 2653             --expDiff;
 2654         }
 2655         else {
 2656             bSig |= LIT64( 0x2000000000000000 );
 2657         }
 2658         shift64RightJamming( bSig, expDiff, &bSig );
 2659         zExp = aExp;
 2660     }
 2661     else if ( expDiff < 0 ) {
 2662         if ( bExp == 0x7FF ) {
 2663             if ( bSig ) return propagateFloat64NaN( a, b );
 2664             return packFloat64( zSign, 0x7FF, 0 );
 2665         }
 2666         if ( aExp == 0 ) {
 2667             ++expDiff;
 2668         }
 2669         else {
 2670             aSig |= LIT64( 0x2000000000000000 );
 2671         }
 2672         shift64RightJamming( aSig, - expDiff, &aSig );
 2673         zExp = bExp;
 2674     }
 2675     else {
 2676         if ( aExp == 0x7FF ) {
 2677             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
 2678             return a;
 2679         }
 2680         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
 2681         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
 2682         zExp = aExp;
 2683         goto roundAndPack;
 2684     }
 2685     aSig |= LIT64( 0x2000000000000000 );
 2686     zSig = ( aSig + bSig )<<1;
 2687     --zExp;
 2688     if ( (sbits64) zSig < 0 ) {
 2689         zSig = aSig + bSig;
 2690         ++zExp;
 2691     }
 2692  roundAndPack:
 2693     return roundAndPackFloat64( zSign, zExp, zSig );
 2694 
 2695 }
 2696 
 2697 /*
 2698 -------------------------------------------------------------------------------
 2699 Returns the result of subtracting the absolute values of the double-
 2700 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 2701 difference is negated before being returned.  `zSign' is ignored if the
 2702 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 2703 Standard for Binary Floating-Point Arithmetic.
 2704 -------------------------------------------------------------------------------
 2705 */
 2706 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
 2707 {
 2708     int16 aExp, bExp, zExp;
 2709     bits64 aSig, bSig, zSig;
 2710     int16 expDiff;
 2711 
 2712     aSig = extractFloat64Frac( a );
 2713     aExp = extractFloat64Exp( a );
 2714     bSig = extractFloat64Frac( b );
 2715     bExp = extractFloat64Exp( b );
 2716     expDiff = aExp - bExp;
 2717     aSig <<= 10;
 2718     bSig <<= 10;
 2719     if ( 0 < expDiff ) goto aExpBigger;
 2720     if ( expDiff < 0 ) goto bExpBigger;
 2721     if ( aExp == 0x7FF ) {
 2722         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
 2723         float_raise( float_flag_invalid );
 2724         return float64_default_nan;
 2725     }
 2726     if ( aExp == 0 ) {
 2727         aExp = 1;
 2728         bExp = 1;
 2729     }
 2730     if ( bSig < aSig ) goto aBigger;
 2731     if ( aSig < bSig ) goto bBigger;
 2732     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
 2733  bExpBigger:
 2734     if ( bExp == 0x7FF ) {
 2735         if ( bSig ) return propagateFloat64NaN( a, b );
 2736         return packFloat64( zSign ^ 1, 0x7FF, 0 );
 2737     }
 2738     if ( aExp == 0 ) {
 2739         ++expDiff;
 2740     }
 2741     else {
 2742         aSig |= LIT64( 0x4000000000000000 );
 2743     }
 2744     shift64RightJamming( aSig, - expDiff, &aSig );
 2745     bSig |= LIT64( 0x4000000000000000 );
 2746  bBigger:
 2747     zSig = bSig - aSig;
 2748     zExp = bExp;
 2749     zSign ^= 1;
 2750     goto normalizeRoundAndPack;
 2751  aExpBigger:
 2752     if ( aExp == 0x7FF ) {
 2753         if ( aSig ) return propagateFloat64NaN( a, b );
 2754         return a;
 2755     }
 2756     if ( bExp == 0 ) {
 2757         --expDiff;
 2758     }
 2759     else {
 2760         bSig |= LIT64( 0x4000000000000000 );
 2761     }
 2762     shift64RightJamming( bSig, expDiff, &bSig );
 2763     aSig |= LIT64( 0x4000000000000000 );
 2764  aBigger:
 2765     zSig = aSig - bSig;
 2766     zExp = aExp;
 2767  normalizeRoundAndPack:
 2768     --zExp;
 2769     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
 2770 
 2771 }
 2772 
 2773 /*
 2774 -------------------------------------------------------------------------------
 2775 Returns the result of adding the double-precision floating-point values `a'
 2776 and `b'.  The operation is performed according to the IEC/IEEE Standard for
 2777 Binary Floating-Point Arithmetic.
 2778 -------------------------------------------------------------------------------
 2779 */
 2780 float64 float64_add( float64 a, float64 b )
 2781 {
 2782     flag aSign, bSign;
 2783 
 2784     aSign = extractFloat64Sign( a );
 2785     bSign = extractFloat64Sign( b );
 2786     if ( aSign == bSign ) {
 2787         return addFloat64Sigs( a, b, aSign );
 2788     }
 2789     else {
 2790         return subFloat64Sigs( a, b, aSign );
 2791     }
 2792 
 2793 }
 2794 
 2795 /*
 2796 -------------------------------------------------------------------------------
 2797 Returns the result of subtracting the double-precision floating-point values
 2798 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 2799 for Binary Floating-Point Arithmetic.
 2800 -------------------------------------------------------------------------------
 2801 */
 2802 float64 float64_sub( float64 a, float64 b )
 2803 {
 2804     flag aSign, bSign;
 2805 
 2806     aSign = extractFloat64Sign( a );
 2807     bSign = extractFloat64Sign( b );
 2808     if ( aSign == bSign ) {
 2809         return subFloat64Sigs( a, b, aSign );
 2810     }
 2811     else {
 2812         return addFloat64Sigs( a, b, aSign );
 2813     }
 2814 
 2815 }
 2816 
 2817 /*
 2818 -------------------------------------------------------------------------------
 2819 Returns the result of multiplying the double-precision floating-point values
 2820 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 2821 for Binary Floating-Point Arithmetic.
 2822 -------------------------------------------------------------------------------
 2823 */
 2824 float64 float64_mul( float64 a, float64 b )
 2825 {
 2826     flag aSign, bSign, zSign;
 2827     int16 aExp, bExp, zExp;
 2828     bits64 aSig, bSig, zSig0, zSig1;
 2829 
 2830     aSig = extractFloat64Frac( a );
 2831     aExp = extractFloat64Exp( a );
 2832     aSign = extractFloat64Sign( a );
 2833     bSig = extractFloat64Frac( b );
 2834     bExp = extractFloat64Exp( b );
 2835     bSign = extractFloat64Sign( b );
 2836     zSign = aSign ^ bSign;
 2837     if ( aExp == 0x7FF ) {
 2838         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
 2839             return propagateFloat64NaN( a, b );
 2840         }
 2841         if ( ( bExp | bSig ) == 0 ) {
 2842             float_raise( float_flag_invalid );
 2843             return float64_default_nan;
 2844         }
 2845         return packFloat64( zSign, 0x7FF, 0 );
 2846     }
 2847     if ( bExp == 0x7FF ) {
 2848         if ( bSig ) return propagateFloat64NaN( a, b );
 2849         if ( ( aExp | aSig ) == 0 ) {
 2850             float_raise( float_flag_invalid );
 2851             return float64_default_nan;
 2852         }
 2853         return packFloat64( zSign, 0x7FF, 0 );
 2854     }
 2855     if ( aExp == 0 ) {
 2856         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
 2857         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2858     }
 2859     if ( bExp == 0 ) {
 2860         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
 2861         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2862     }
 2863     zExp = aExp + bExp - 0x3FF;
 2864     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
 2865     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2866     mul64To128( aSig, bSig, &zSig0, &zSig1 );
 2867     zSig0 |= ( zSig1 != 0 );
 2868     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
 2869         zSig0 <<= 1;
 2870         --zExp;
 2871     }
 2872     return roundAndPackFloat64( zSign, zExp, zSig0 );
 2873 
 2874 }
 2875 
 2876 /*
 2877 -------------------------------------------------------------------------------
 2878 Returns the result of dividing the double-precision floating-point value `a'
 2879 by the corresponding value `b'.  The operation is performed according to
 2880 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2881 -------------------------------------------------------------------------------
 2882 */
 2883 float64 float64_div( float64 a, float64 b )
 2884 {
 2885     flag aSign, bSign, zSign;
 2886     int16 aExp, bExp, zExp;
 2887     bits64 aSig, bSig, zSig;
 2888     bits64 rem0, rem1;
 2889     bits64 term0, term1;
 2890 
 2891     aSig = extractFloat64Frac( a );
 2892     aExp = extractFloat64Exp( a );
 2893     aSign = extractFloat64Sign( a );
 2894     bSig = extractFloat64Frac( b );
 2895     bExp = extractFloat64Exp( b );
 2896     bSign = extractFloat64Sign( b );
 2897     zSign = aSign ^ bSign;
 2898     if ( aExp == 0x7FF ) {
 2899         if ( aSig ) return propagateFloat64NaN( a, b );
 2900         if ( bExp == 0x7FF ) {
 2901             if ( bSig ) return propagateFloat64NaN( a, b );
 2902             float_raise( float_flag_invalid );
 2903             return float64_default_nan;
 2904         }
 2905         return packFloat64( zSign, 0x7FF, 0 );
 2906     }
 2907     if ( bExp == 0x7FF ) {
 2908         if ( bSig ) return propagateFloat64NaN( a, b );
 2909         return packFloat64( zSign, 0, 0 );
 2910     }
 2911     if ( bExp == 0 ) {
 2912         if ( bSig == 0 ) {
 2913             if ( ( aExp | aSig ) == 0 ) {
 2914                 float_raise( float_flag_invalid );
 2915                 return float64_default_nan;
 2916             }
 2917             float_raise( float_flag_divbyzero );
 2918             return packFloat64( zSign, 0x7FF, 0 );
 2919         }
 2920         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2921     }
 2922     if ( aExp == 0 ) {
 2923         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
 2924         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2925     }
 2926     zExp = aExp - bExp + 0x3FD;
 2927     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
 2928     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2929     if ( bSig <= ( aSig + aSig ) ) {
 2930         aSig >>= 1;
 2931         ++zExp;
 2932     }
 2933     zSig = estimateDiv128To64( aSig, 0, bSig );
 2934     if ( ( zSig & 0x1FF ) <= 2 ) {
 2935         mul64To128( bSig, zSig, &term0, &term1 );
 2936         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
 2937         while ( (sbits64) rem0 < 0 ) {
 2938             --zSig;
 2939             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
 2940         }
 2941         zSig |= ( rem1 != 0 );
 2942     }
 2943     return roundAndPackFloat64( zSign, zExp, zSig );
 2944 
 2945 }
 2946 
 2947 #ifndef SOFTFLOAT_FOR_GCC
 2948 /*
 2949 -------------------------------------------------------------------------------
 2950 Returns the remainder of the double-precision floating-point value `a'
 2951 with respect to the corresponding value `b'.  The operation is performed
 2952 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2953 -------------------------------------------------------------------------------
 2954 */
 2955 float64 float64_rem( float64 a, float64 b )
 2956 {
 2957     flag aSign, bSign, zSign; /* bSign is assigned below but never read */
 2958     int16 aExp, bExp, expDiff;
 2959     bits64 aSig, bSig;
 2960     bits64 q, alternateASig;
 2961     sbits64 sigMean;
 2962 
 2963     aSig = extractFloat64Frac( a );
 2964     aExp = extractFloat64Exp( a );
 2965     aSign = extractFloat64Sign( a );
 2966     bSig = extractFloat64Frac( b );
 2967     bExp = extractFloat64Exp( b );
 2968     bSign = extractFloat64Sign( b );
 2969     if ( aExp == 0x7FF ) { /* a is a NaN or an infinity */
 2970         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
 2971             return propagateFloat64NaN( a, b ); /* either operand is a NaN */
 2972         }
 2973         float_raise( float_flag_invalid ); /* a is infinite: invalid */
 2974         return float64_default_nan;
 2975     }
 2976     if ( bExp == 0x7FF ) { /* b is a NaN or an infinity */
 2977         if ( bSig ) return propagateFloat64NaN( a, b );
 2978         return a; /* b is infinite, a is finite: a is returned unchanged */
 2979     }
 2980     if ( bExp == 0 ) {
 2981         if ( bSig == 0 ) { /* b is zero: invalid */
 2982             float_raise( float_flag_invalid );
 2983             return float64_default_nan;
 2984         }
 2985         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2986     }
 2987     if ( aExp == 0 ) {
 2988         if ( aSig == 0 ) return a; /* a is zero: returned unchanged */
 2989         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2990     }
 2991     expDiff = aExp - bExp;
 2992     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; /* leading bit set, moved up to bit 63 */
 2993     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2994     if ( expDiff < 0 ) {
 2995         if ( expDiff < -1 ) return a; /* a's exponent is 2 or more below b's: a is returned unchanged */
 2996         aSig >>= 1;
 2997     }
 2998     q = ( bSig <= aSig ); /* 1 when bSig can be subtracted from aSig once, else 0 */
 2999     if ( q ) aSig -= bSig;
 3000     expDiff -= 64;
 3001     while ( 0 < expDiff ) { /* each pass lowers expDiff by 62 */
 3002         q = estimateDiv128To64( aSig, 0, bSig );
 3003         q = ( 2 < q ) ? q - 2 : 0; /* estimate is lowered by 2 and clamped at 0; NOTE(review): presumably guards against an over-estimate, confirm against estimateDiv128To64 */
 3004         aSig = - ( ( bSig>>2 ) * q ); /* unsigned arithmetic, so this wraps modulo 2^64 */
 3005         expDiff -= 62;
 3006     }
 3007     expDiff += 64;
 3008     if ( 0 < expDiff ) { /* final partial step for the remaining exponent difference */
 3009         q = estimateDiv128To64( aSig, 0, bSig );
 3010         q = ( 2 < q ) ? q - 2 : 0;
 3011         q >>= 64 - expDiff;
 3012         bSig >>= 2;
 3013         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
 3014     }
 3015     else {
 3016         aSig >>= 2;
 3017         bSig >>= 2;
 3018     }
 3019     do { /* subtract bSig until aSig, read as signed, turns negative */
 3020         alternateASig = aSig; /* value before this subtraction */
 3021         ++q; /* counts the subtractions */
 3022         aSig -= bSig;
 3023     } while ( 0 <= (sbits64) aSig );
 3024     sigMean = aSig + alternateASig; /* sum of the negative and the non-negative candidate */
 3025     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
 3026         aSig = alternateASig; /* non-negative candidate is smaller in magnitude, or a tie with q odd */
 3027     }
 3028     zSign = ( (sbits64) aSig < 0 ); /* set when the chosen candidate is negative */
 3029     if ( zSign ) aSig = - aSig; /* keep the magnitude only */
 3030     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
 3031 
 3032 }
 3033 
 3034 /*
 3035 -------------------------------------------------------------------------------
 3036 Returns the square root of the double-precision floating-point value `a'.
 3037 The operation is performed according to the IEC/IEEE Standard for Binary
 3038 Floating-Point Arithmetic.
 3039 -------------------------------------------------------------------------------
 3040 */
 3041 float64 float64_sqrt( float64 a )
 3042 {
 3043     flag aSign;
 3044     int16 aExp, zExp;
 3045     bits64 aSig, zSig, doubleZSig;
 3046     bits64 rem0, rem1, term0, term1;
 3047 
 3048     aSig = extractFloat64Frac( a );
 3049     aExp = extractFloat64Exp( a );
 3050     aSign = extractFloat64Sign( a );
 3051     if ( aExp == 0x7FF ) { /* a is a NaN or an infinity */
 3052         if ( aSig ) return propagateFloat64NaN( a, a );
 3053         if ( ! aSign ) return a; /* positive infinity is returned unchanged */
 3054         float_raise( float_flag_invalid ); /* negative infinity: invalid */
 3055         return float64_default_nan;
 3056     }
 3057     if ( aSign ) {
 3058         if ( ( aExp | aSig ) == 0 ) return a; /* negative zero is returned unchanged */
 3059         float_raise( float_flag_invalid ); /* any other negative value: invalid */
 3060         return float64_default_nan;
 3061     }
 3062     if ( aExp == 0 ) {
 3063         if ( aSig == 0 ) return 0; /* positive zero */
 3064         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 3065     }
 3066     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; /* halve the unbiased exponent, then re-bias with 0x3FE */
 3067     aSig |= LIT64( 0x0010000000000000 ); /* set the leading significand bit */
 3068     zSig = estimateSqrt32( aExp, aSig>>21 ); /* first estimate from the top 32 significand bits */
 3069     aSig <<= 9 - ( aExp & 1 ); /* shift by 8 when aExp is odd, by 9 when it is even */
 3070     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
 3071     if ( ( zSig & 0x1FF ) <= 5 ) { /* exact remainder is only computed when the low 9 bits are 5 or less */
 3072         doubleZSig = zSig<<1;
 3073         mul64To128( zSig, zSig, &term0, &term1 );
 3074         sub128( aSig, 0, term0, term1, &rem0, &rem1 ); /* remainder = aSig * 2^64 - zSig^2 */
 3075         while ( (sbits64) rem0 < 0 ) { /* estimate too large: step zSig down and repair the remainder */
 3076             --zSig;
 3077             doubleZSig -= 2;
 3078             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); /* adds 2 * zSig + 1, using the decremented zSig */
 3079         }
 3080         zSig |= ( ( rem0 | rem1 ) != 0 ); /* sticky bit for a nonzero remainder */
 3081     }
 3082     return roundAndPackFloat64( 0, zExp, zSig ); /* sign argument is always 0 here */
 3083 
 3084 }
 3085 #endif
 3086 
 3087 /*
 3088 -------------------------------------------------------------------------------
 3089 Returns 1 if the double-precision floating-point value `a' is equal to the
 3090 corresponding value `b', and 0 otherwise.  The comparison is performed
 3091 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3092 -------------------------------------------------------------------------------
 3093 */
 3094 flag float64_eq( float64 a, float64 b )
 3095 {
 3096 
 3097     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3098          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3099        ) {
 3100         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3101             float_raise( float_flag_invalid );
 3102         }
 3103         return 0;
 3104     }
 3105     return ( a == b ) ||
 3106         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
 3107 
 3108 }
 3109 
 3110 /*
 3111 -------------------------------------------------------------------------------
 3112 Returns 1 if the double-precision floating-point value `a' is less than or
 3113 equal to the corresponding value `b', and 0 otherwise.  The comparison is
 3114 performed according to the IEC/IEEE Standard for Binary Floating-Point
 3115 Arithmetic.
 3116 -------------------------------------------------------------------------------
 3117 */
 3118 flag float64_le( float64 a, float64 b )
 3119 {
 3120     flag aSign, bSign;
 3121 
 3122     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3123          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3124        ) {
 3125         float_raise( float_flag_invalid );
 3126         return 0;
 3127     }
 3128     aSign = extractFloat64Sign( a );
 3129     bSign = extractFloat64Sign( b );
 3130     if ( aSign != bSign )
 3131         return aSign ||
 3132             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
 3133               0 );
 3134     return ( a == b ) ||
 3135         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
 3136 
 3137 }
 3138 
 3139 /*
 3140 -------------------------------------------------------------------------------
 3141 Returns 1 if the double-precision floating-point value `a' is less than
 3142 the corresponding value `b', and 0 otherwise.  The comparison is performed
 3143 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3144 -------------------------------------------------------------------------------
 3145 */
 3146 flag float64_lt( float64 a, float64 b )
 3147 {
 3148     flag aSign, bSign;
 3149 
 3150     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3151          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3152        ) {
 3153         float_raise( float_flag_invalid );
 3154         return 0;
 3155     }
 3156     aSign = extractFloat64Sign( a );
 3157     bSign = extractFloat64Sign( b );
 3158     if ( aSign != bSign )
 3159         return aSign &&
 3160             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
 3161               0 );
 3162     return ( a != b ) &&
 3163         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
 3164 
 3165 }
 3166 
 3167 #ifndef SOFTFLOAT_FOR_GCC
 3168 /*
 3169 -------------------------------------------------------------------------------
 3170 Returns 1 if the double-precision floating-point value `a' is equal to the
 3171 corresponding value `b', and 0 otherwise.  The invalid exception is raised
 3172 if either operand is a NaN.  Otherwise, the comparison is performed
 3173 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3174 -------------------------------------------------------------------------------
 3175 */
 3176 flag float64_eq_signaling( float64 a, float64 b )
 3177 {
 3178 
 3179     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3180          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3181        ) {
 3182         float_raise( float_flag_invalid );
 3183         return 0;
 3184     }
 3185     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
 3186 
 3187 }
 3188 
 3189 /*
 3190 -------------------------------------------------------------------------------
 3191 Returns 1 if the double-precision floating-point value `a' is less than or
 3192 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 3193 cause an exception.  Otherwise, the comparison is performed according to the
 3194 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3195 -------------------------------------------------------------------------------
 3196 */
 3197 flag float64_le_quiet( float64 a, float64 b )
 3198 {
 3199     flag aSign, bSign;
 3200 
 3201     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3202          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3203        ) {
 3204         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3205             float_raise( float_flag_invalid );
 3206         }
 3207         return 0;
 3208     }
 3209     aSign = extractFloat64Sign( a );
 3210     bSign = extractFloat64Sign( b );
 3211     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
 3212     return ( a == b ) || ( aSign ^ ( a < b ) );
 3213 
 3214 }
 3215 
 3216 /*
 3217 -------------------------------------------------------------------------------
 3218 Returns 1 if the double-precision floating-point value `a' is less than
 3219 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 3220 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 3221 Standard for Binary Floating-Point Arithmetic.
 3222 -------------------------------------------------------------------------------
 3223 */
 3224 flag float64_lt_quiet( float64 a, float64 b )
 3225 {
 3226     flag aSign, bSign;
 3227 
 3228     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3229          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3230        ) {
 3231         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3232             float_raise( float_flag_invalid );
 3233         }
 3234         return 0;
 3235     }
 3236     aSign = extractFloat64Sign( a );
 3237     bSign = extractFloat64Sign( b );
 3238     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
 3239     return ( a != b ) && ( aSign ^ ( a < b ) );
 3240 
 3241 }
 3242 #endif
 3243 
 3244 #ifdef FLOATX80
 3245 
 3246 /*
 3247 -------------------------------------------------------------------------------
 3248 Returns the result of converting the extended double-precision floating-
 3249 point value `a' to the 32-bit two's complement integer format.  The
 3250 conversion is performed according to the IEC/IEEE Standard for Binary
 3251 Floating-Point Arithmetic---which means in particular that the conversion
 3252 is rounded according to the current rounding mode.  If `a' is a NaN, the
 3253 largest positive integer is returned.  Otherwise, if the conversion
 3254 overflows, the largest integer with the same sign as `a' is returned.
 3255 -------------------------------------------------------------------------------
 3256 */
 3257 int32 floatx80_to_int32( floatx80 a )
 3258 {
 3259     flag aSign;
 3260     int32 aExp, shiftCount;
 3261     bits64 aSig;
 3262 
 3263     aSig = extractFloatx80Frac( a );
 3264     aExp = extractFloatx80Exp( a );
 3265     aSign = extractFloatx80Sign( a );
 3266     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 3267     shiftCount = 0x4037 - aExp;
 3268     if ( shiftCount <= 0 ) shiftCount = 1;
 3269     shift64RightJamming( aSig, shiftCount, &aSig );
 3270     return roundAndPackInt32( aSign, aSig );
 3271 
 3272 }
 3273 
 3274 /*
 3275 -------------------------------------------------------------------------------
 3276 Returns the result of converting the extended double-precision floating-
 3277 point value `a' to the 32-bit two's complement integer format.  The
 3278 conversion is performed according to the IEC/IEEE Standard for Binary
 3279 Floating-Point Arithmetic, except that the conversion is always rounded
 3280 toward zero.  If `a' is a NaN, the largest positive integer is returned.
 3281 Otherwise, if the conversion overflows, the largest integer with the same
 3282 sign as `a' is returned.
 3283 -------------------------------------------------------------------------------
 3284 */
 3285 int32 floatx80_to_int32_round_to_zero( floatx80 a )
 3286 {
 3287     flag aSign;
 3288     int32 aExp, shiftCount;
 3289     bits64 aSig, savedASig;
 3290     int32 z;
 3291 
 3292     aSig = extractFloatx80Frac( a );
 3293     aExp = extractFloatx80Exp( a );
 3294     aSign = extractFloatx80Sign( a );
 3295     if ( 0x401E < aExp ) {
 3296         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 3297         goto invalid;
 3298     }
 3299     else if ( aExp < 0x3FFF ) {
 3300         if ( aExp || aSig ) float_set_inexact();
 3301         return 0;
 3302     }
 3303     shiftCount = 0x403E - aExp;
 3304     savedASig = aSig;
 3305     aSig >>= shiftCount;
 3306     z = aSig;
 3307     if ( aSign ) z = - z;
 3308     if ( ( z < 0 ) ^ aSign ) {
 3309  invalid:
 3310         float_raise( float_flag_invalid );
 3311         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 3312     }
 3313     if ( ( aSig<<shiftCount ) != savedASig ) {
 3314         float_set_inexact();
 3315     }
 3316     return z;
 3317 
 3318 }
 3319 
 3320 /*
 3321 -------------------------------------------------------------------------------
 3322 Returns the result of converting the extended double-precision floating-
 3323 point value `a' to the 64-bit two's complement integer format.  The
 3324 conversion is performed according to the IEC/IEEE Standard for Binary
 3325 Floating-Point Arithmetic---which means in particular that the conversion
 3326 is rounded according to the current rounding mode.  If `a' is a NaN,
 3327 the largest positive integer is returned.  Otherwise, if the conversion
 3328 overflows, the largest integer with the same sign as `a' is returned.
 3329 -------------------------------------------------------------------------------
 3330 */
 3331 int64 floatx80_to_int64( floatx80 a )
 3332 {
 3333     flag aSign;
 3334     int32 aExp, shiftCount;
 3335     bits64 aSig, aSigExtra;
 3336 
 3337     aSig = extractFloatx80Frac( a );
 3338     aExp = extractFloatx80Exp( a );
 3339     aSign = extractFloatx80Sign( a );
 3340     shiftCount = 0x403E - aExp;
 3341     if ( shiftCount <= 0 ) {
 3342         if ( shiftCount ) {
 3343             float_raise( float_flag_invalid );
 3344             if (    ! aSign
 3345                  || (    ( aExp == 0x7FFF )
 3346                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
 3347                ) {
 3348                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 3349             }
 3350             return (sbits64) LIT64( 0x8000000000000000 );
 3351         }
 3352         aSigExtra = 0;
 3353     }
 3354     else {
 3355         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
 3356     }
 3357     return roundAndPackInt64( aSign, aSig, aSigExtra );
 3358 
 3359 }
 3360 
 3361 /*
 3362 -------------------------------------------------------------------------------
 3363 Returns the result of converting the extended double-precision floating-
 3364 point value `a' to the 64-bit two's complement integer format.  The
 3365 conversion is performed according to the IEC/IEEE Standard for Binary
 3366 Floating-Point Arithmetic, except that the conversion is always rounded
 3367 toward zero.  If `a' is a NaN, the largest positive integer is returned.
 3368 Otherwise, if the conversion overflows, the largest integer with the same
 3369 sign as `a' is returned.
 3370 -------------------------------------------------------------------------------
 3371 */
 3372 int64 floatx80_to_int64_round_to_zero( floatx80 a )
 3373 {
 3374     flag aSign;
 3375     int32 aExp, shiftCount;
 3376     bits64 aSig;
 3377     int64 z;
 3378 
 3379     aSig = extractFloatx80Frac( a );
 3380     aExp = extractFloatx80Exp( a );
 3381     aSign = extractFloatx80Sign( a );
 3382     shiftCount = aExp - 0x403E;
 3383     if ( 0 <= shiftCount ) {
 3384         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
 3385         if ( ( a.high != 0xC03E ) || aSig ) {
 3386             float_raise( float_flag_invalid );
 3387             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
 3388                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 3389             }
 3390         }
 3391         return (sbits64) LIT64( 0x8000000000000000 );
 3392     }
 3393     else if ( aExp < 0x3FFF ) {
 3394         if ( aExp | aSig ) float_set_inexact();
 3395         return 0;
 3396     }
 3397     z = aSig>>( - shiftCount );
 3398     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
 3399         float_set_inexact();
 3400     }
 3401     if ( aSign ) z = - z;
 3402     return z;
 3403 
 3404 }
 3405 
 3406 /*
 3407 -------------------------------------------------------------------------------
 3408 Returns the result of converting the extended double-precision floating-
 3409 point value `a' to the single-precision floating-point format.  The
 3410 conversion is performed according to the IEC/IEEE Standard for Binary
 3411 Floating-Point Arithmetic.
 3412 -------------------------------------------------------------------------------
 3413 */
 3414 float32 floatx80_to_float32( floatx80 a )
 3415 {
 3416     flag aSign;
 3417     int32 aExp;
 3418     bits64 aSig;
 3419 
 3420     aSig = extractFloatx80Frac( a );
 3421     aExp = extractFloatx80Exp( a );
 3422     aSign = extractFloatx80Sign( a );
 3423     if ( aExp == 0x7FFF ) {
 3424         if ( (bits64) ( aSig<<1 ) ) {
 3425             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
 3426         }
 3427         return packFloat32( aSign, 0xFF, 0 );
 3428     }
 3429     shift64RightJamming( aSig, 33, &aSig );
 3430     if ( aExp || aSig ) aExp -= 0x3F81;
 3431     return roundAndPackFloat32( aSign, aExp, aSig );
 3432 
 3433 }
 3434 
 3435 /*
 3436 -------------------------------------------------------------------------------
 3437 Returns the result of converting the extended double-precision floating-
 3438 point value `a' to the double-precision floating-point format.  The
 3439 conversion is performed according to the IEC/IEEE Standard for Binary
 3440 Floating-Point Arithmetic.
 3441 -------------------------------------------------------------------------------
 3442 */
 3443 float64 floatx80_to_float64( floatx80 a )
 3444 {
 3445     flag aSign;
 3446     int32 aExp;
 3447     bits64 aSig, zSig;
 3448 
 3449     aSig = extractFloatx80Frac( a );
 3450     aExp = extractFloatx80Exp( a );
 3451     aSign = extractFloatx80Sign( a );
 3452     if ( aExp == 0x7FFF ) {
 3453         if ( (bits64) ( aSig<<1 ) ) {
 3454             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
 3455         }
 3456         return packFloat64( aSign, 0x7FF, 0 );
 3457     }
 3458     shift64RightJamming( aSig, 1, &zSig );
 3459     if ( aExp || aSig ) aExp -= 0x3C01;
 3460     return roundAndPackFloat64( aSign, aExp, zSig );
 3461 
 3462 }
 3463 
 3464 #ifdef FLOAT128
 3465 
 3466 /*
 3467 -------------------------------------------------------------------------------
 3468 Returns the result of converting the extended double-precision floating-
 3469 point value `a' to the quadruple-precision floating-point format.  The
 3470 conversion is performed according to the IEC/IEEE Standard for Binary
 3471 Floating-Point Arithmetic.
 3472 -------------------------------------------------------------------------------
 3473 */
 3474 float128 floatx80_to_float128( floatx80 a )
 3475 {
 3476     flag aSign;
 3477     int16 aExp;
 3478     bits64 aSig, zSig0, zSig1;
 3479 
 3480     aSig = extractFloatx80Frac( a );
 3481     aExp = extractFloatx80Exp( a );
 3482     aSign = extractFloatx80Sign( a );
 3483     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
 3484         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
 3485     }
 3486     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
 3487     return packFloat128( aSign, aExp, zSig0, zSig1 );
 3488 
 3489 }
 3490 
 3491 #endif
 3492 
/*
-------------------------------------------------------------------------------
Rounds the extended double-precision floating-point value `a' to an integer,
and returns the result as an extended double-precision floating-point
value.  The rounding direction is taken from float_rounding_mode(), and the
inexact flag is set whenever the returned value differs from `a'.  The
operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_round_to_int( floatx80 a )
{
    flag aSign;
    int32 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    floatx80 z;

    aExp = extractFloatx80Exp( a );
    /*
     * Exponent 0x403E or above: `a' is returned unchanged, except that a
     * NaN (maximum exponent with bits below the top significand bit set)
     * goes through the NaN propagation helper.
     */
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN( a, a );
        }
        return a;
    }
    /*
     * Exponent below 0x3FFF: the result is chosen directly from the
     * rounding mode.  It is either a zero or the value packed as
     * ( sign, 0x3FFF, 0x8000000000000000 ).
     */
    if ( aExp < 0x3FFF ) {
        /* Zero exponent and no bits below the top bit: return as is. */
        if (    ( aExp == 0 )
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
            return a;
        }
        float_set_inexact();
        aSign = extractFloatx80Sign( a );
        switch ( float_rounding_mode() ) {
         case float_round_nearest_even:
            /*
             * Only exponent 0x3FFE with bits below the top significand bit
             * rounds away from zero; everything else falls to the zero
             * result after the switch.
             */
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
            }
            break;
         case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
        }
        /* Remaining modes (and the nearest-even break): signed zero. */
        return packFloatx80( aSign, 0, 0 );
    }
    /*
     * 0x3FFF <= aExp < 0x403E: lastBitMask selects the lowest significand
     * bit that is kept, roundBitsMask all the bits below it.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    roundingMode = float_rounding_mode();
    if ( roundingMode == float_round_nearest_even ) {
        /* Add half of the last kept bit; on an exact tie clear that bit. */
        z.low += lastBitMask>>1;
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
    }
    else if ( roundingMode != float_round_to_zero ) {
        /*
         * Directed rounding: bump the magnitude when exactly one of
         * "sign bit set" and "mode is round-up" holds.
         */
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z.low += roundBitsMask;
        }
    }
    /* Discard the bits below the last kept bit. */
    z.low &= ~ roundBitsMask;
    /* Significand wrapped to zero: step the exponent, restore the top bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = LIT64( 0x8000000000000000 );
    }
    if ( z.low != a.low ) float_set_inexact();
    return z;

}
 3566 
/*
-------------------------------------------------------------------------------
Returns the result of adding the absolute values of the extended double-
precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
negated before being returned.  `zSign' is ignored if the result is a NaN.
The result is rounded through roundAndPackFloatx80() using
floatx80_rounding_precision.  The addition is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
{
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    int32 expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            /* Maximum exponent: propagate a NaN, otherwise return `a'. */
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
            return a;
        }
        /* Zero exponent field in `b': align with one shift fewer. */
        if ( bExp == 0 ) --expDiff;
        /* Align `b' to `a'; the extra output word is kept in zSig1. */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
            /* Maximum exponent, not a NaN: same pattern carrying zSign. */
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN( a, b );
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: normalize the sum directly. */
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        /* Skip the top-bit test below and always shift by one place. */
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit of the sum still set: use it as is. */
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Shift right one place, force the top bit on, bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= LIT64( 0x8000000000000000 );
    ++zExp;
 roundAndPack:
    return
        roundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

}
 3633 
/*
-------------------------------------------------------------------------------
Returns the result of subtracting the absolute values of the extended
double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
difference is negated before being returned.  `zSign' is ignored if the
result is a NaN.  The result is normalized and rounded through
normalizeRoundAndPackFloatx80() using floatx80_rounding_precision.  The
subtraction is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
{
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    int32 expDiff;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here down to the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN( a, b );
        }
        /* Both at maximum exponent, neither a NaN: invalid, default NaN. */
        float_raise( float_flag_invalid );
        z.low = floatx80_default_nan_low;
        z.high = floatx80_default_nan_high;
        return z;
    }
    /* Zero exponent fields are treated as exponent 1 for the result. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: zero, with the sign bit set only when rounding down. */
    return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        /* Maximum exponent, not a NaN: same pattern, sign flipped. */
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* Zero exponent field in `a': align with one shift fewer. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| - |a|; the result takes the opposite sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return a;
    }
    /* Zero exponent field in `b': align with one shift fewer. */
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| - |b|; the sign stays zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return
        normalizeRoundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

}
 3702 
 3703 /*
 3704 -------------------------------------------------------------------------------
 3705 Returns the result of adding the extended double-precision floating-point
 3706 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 3707 Standard for Binary Floating-Point Arithmetic.
 3708 -------------------------------------------------------------------------------
 3709 */
 3710 floatx80 floatx80_add( floatx80 a, floatx80 b )
 3711 {
 3712     flag aSign, bSign;
 3713 
 3714     aSign = extractFloatx80Sign( a );
 3715     bSign = extractFloatx80Sign( b );
 3716     if ( aSign == bSign ) {
 3717         return addFloatx80Sigs( a, b, aSign );
 3718     }
 3719     else {
 3720         return subFloatx80Sigs( a, b, aSign );
 3721     }
 3722 
 3723 }
 3724 
 3725 /*
 3726 -------------------------------------------------------------------------------
 3727 Returns the result of subtracting the extended double-precision floating-
 3728 point values `a' and `b'.  The operation is performed according to the
 3729 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3730 -------------------------------------------------------------------------------
 3731 */
 3732 floatx80 floatx80_sub( floatx80 a, floatx80 b )
 3733 {
 3734     flag aSign, bSign;
 3735 
 3736     aSign = extractFloatx80Sign( a );
 3737     bSign = extractFloatx80Sign( b );
 3738     if ( aSign == bSign ) {
 3739         return subFloatx80Sigs( a, b, aSign );
 3740     }
 3741     else {
 3742         return addFloatx80Sigs( a, b, aSign );
 3743     }
 3744 
 3745 }
 3746 
 3747 /*
 3748 -------------------------------------------------------------------------------
 3749 Returns the result of multiplying the extended double-precision floating-
 3750 point values `a' and `b'.  The operation is performed according to the
 3751 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3752 -------------------------------------------------------------------------------
 3753 */
 3754 floatx80 floatx80_mul( floatx80 a, floatx80 b )
 3755 {
 3756     flag aSign, bSign, zSign;
 3757     int32 aExp, bExp, zExp;
 3758     bits64 aSig, bSig, zSig0, zSig1;
 3759     floatx80 z;
 3760 
 3761     aSig = extractFloatx80Frac( a );
 3762     aExp = extractFloatx80Exp( a );
 3763     aSign = extractFloatx80Sign( a );
 3764     bSig = extractFloatx80Frac( b );
 3765     bExp = extractFloatx80Exp( b );
 3766     bSign = extractFloatx80Sign( b );
 3767     zSign = aSign ^ bSign;
 3768     if ( aExp == 0x7FFF ) {
 3769         if (    (bits64) ( aSig<<1 )
 3770              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
 3771             return propagateFloatx80NaN( a, b );
 3772         }
 3773         if ( ( bExp | bSig ) == 0 ) goto invalid;
 3774         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3775     }
 3776     if ( bExp == 0x7FFF ) {
 3777         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3778         if ( ( aExp | aSig ) == 0 ) {
 3779  invalid:
 3780             float_raise( float_flag_invalid );
 3781             z.low = floatx80_default_nan_low;
 3782             z.high = floatx80_default_nan_high;
 3783             return z;
 3784         }
 3785         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3786     }
 3787     if ( aExp == 0 ) {
 3788         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
 3789         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
 3790     }
 3791     if ( bExp == 0 ) {
 3792         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
 3793         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
 3794     }
 3795     zExp = aExp + bExp - 0x3FFE;
 3796     mul64To128( aSig, bSig, &zSig0, &zSig1 );
 3797     if ( 0 < (sbits64) zSig0 ) {
 3798         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
 3799         --zExp;
 3800     }
 3801     return
 3802         roundAndPackFloatx80(
 3803             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
 3804 
 3805 }
 3806 
/*
-------------------------------------------------------------------------------
Returns the result of dividing the extended double-precision floating-point
value `a' by the corresponding value `b'.  Dividing zero by zero, or two
maximum-exponent non-NaN operands, raises the invalid exception and yields
the default NaN; dividing any other finite value by zero raises the
divide-by-zero exception.  The quotient is rounded using
floatx80_rounding_precision.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_div( floatx80 a, floatx80 b )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    bits64 rem0, rem1, rem2, term0, term1, term2;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* `a' at the maximum exponent. */
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
        if ( bExp == 0x7FFF ) {
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
            /* Both at maximum exponent, neither a NaN. */
            goto invalid;
        }
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* `b' at the maximum exponent, `a' is not: NaN or signed zero. */
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Divisor is zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise( float_flag_invalid );
                z.low = floatx80_default_nan_low;
                z.high = floatx80_default_nan_high;
                return z;
            }
            float_raise( float_flag_divbyzero );
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /*
     * When the dividend significand is not smaller than the divisor's,
     * halve it (the shifted-out bit goes to rem1) and bump the exponent.
     */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /*
     * Upper quotient word: start from the helper's estimate, form the
     * remainder, and decrement the estimate until the remainder is no
     * longer negative.
     */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (sbits64) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /*
     * Lower quotient word: the estimate is corrected the same way only
     * when, ignoring its top bit, it is 4 or less.  In that case a nonzero
     * final remainder is also OR-ed into the lowest bit.
     */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (sbits64) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return
        roundAndPackFloatx80(
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );

}
 3887 
/*
-------------------------------------------------------------------------------
Returns the remainder of the extended double-precision floating-point value
`a' with respect to the corresponding value `b'.  A maximum-exponent non-NaN
`a', or a zero `b', raises the invalid exception and yields the default NaN.
The result is packed with a fixed rounding precision of 80 rather than
floatx80_rounding_precision.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_rem( floatx80 a, floatx80 b )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, expDiff;
    bits64 aSig0, aSig1, bSig;
    bits64 q, term0, term1, alternateASig0, alternateASig1;
    floatx80 z;

    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    /* NOTE(review): bSign is assigned here but never read afterwards. */
    bSign = extractFloatx80Sign( b );
    if ( aExp == 0x7FFF ) {
        if (    (bits64) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN( a, b );
        }
        /* `a' at the maximum exponent and no NaN involved. */
        goto invalid;
    }
    /* `b' at the maximum exponent: NaN propagates, otherwise return `a'. */
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise( float_flag_invalid );
            z.low = floatx80_default_nan_low;
            z.high = floatx80_default_nan_high;
            return z;
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* Zero exponent and no bits below the top bit: return as is. */
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Force the top bit of the divisor significand on. */
    bSig |= LIT64( 0x8000000000000000 );
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* Exponent of `a' more than one below `b': return `a' itself. */
        if ( expDiff < -1 ) return a;
        /* Exactly one below: shift `a' right a place and treat as equal. */
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit: subtract the divisor once if it fits. */
    q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Reduce 62 exponent steps per pass, using an estimate lowered by 2. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /*
         * Final partial step: keep only the top expDiff bits of the
         * lowered estimate, subtract the matching multiple of the divisor,
         * then subtract the shifted divisor while it still fits, counting
         * each subtraction in q.
         */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    /*
     * Alternate candidate: divisor term minus the current remainder.  It
     * is chosen, with the sign flipped, when it is the smaller of the two,
     * or on a tie when q is odd.
     */
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
              && ( q & 1 ) )
       ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
        zSign = ! zSign;
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1 );

}
 3985 
 3986 /*
 3987 -------------------------------------------------------------------------------
 3988 Returns the square root of the extended double-precision floating-point
 3989 value `a'.  The operation is performed according to the IEC/IEEE Standard
 3990 for Binary Floating-Point Arithmetic.
 3991 -------------------------------------------------------------------------------
 3992 */
 3993 floatx80 floatx80_sqrt( floatx80 a )
 3994 {
 3995     flag aSign;
 3996     int32 aExp, zExp;
 3997     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
 3998     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 3999     floatx80 z;
 4000 
 4001     aSig0 = extractFloatx80Frac( a );
 4002     aExp = extractFloatx80Exp( a );
 4003     aSign = extractFloatx80Sign( a );
 4004     if ( aExp == 0x7FFF ) {
 4005         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
 4006         if ( ! aSign ) return a;
 4007         goto invalid;
 4008     }
 4009     if ( aSign ) {
 4010         if ( ( aExp | aSig0 ) == 0 ) return a;
 4011  invalid:
 4012         float_raise( float_flag_invalid );
 4013         z.low = floatx80_default_nan_low;
 4014         z.high = floatx80_default_nan_high;
 4015         return z;
 4016     }
 4017     if ( aExp == 0 ) {
 4018         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
 4019         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
 4020     }
 4021     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
 4022     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
 4023     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
 4024     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
 4025     doubleZSig0 = zSig0<<1;
 4026     mul64To128( zSig0, zSig0, &term0, &term1 );
 4027     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
 4028     while ( (sbits64) rem0 < 0 ) {
 4029         --zSig0;
 4030         doubleZSig0 -= 2;
 4031         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
 4032     }
 4033     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
 4034     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
 4035         if ( zSig1 == 0 ) zSig1 = 1;
 4036         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
 4037         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
 4038         mul64To128( zSig1, zSig1, &term2, &term3 );
 4039         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
 4040         while ( (sbits64) rem1 < 0 ) {
 4041             --zSig1;
 4042             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
 4043             term3 |= 1;
 4044             term2 |= doubleZSig0;
 4045             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
 4046         }
 4047         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 4048     }
 4049     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
 4050     zSig0 |= doubleZSig0;
 4051     return
 4052         roundAndPackFloatx80(
 4053             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
 4054 
 4055 }
 4056 
 4057 /*
 4058 -------------------------------------------------------------------------------
 4059 Returns 1 if the extended double-precision floating-point value `a' is
 4060 equal to the corresponding value `b', and 0 otherwise.  The comparison is
 4061 performed according to the IEC/IEEE Standard for Binary Floating-Point
 4062 Arithmetic.
 4063 -------------------------------------------------------------------------------
 4064 */
 4065 flag floatx80_eq( floatx80 a, floatx80 b )
 4066 {
 4067 
 4068     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4069               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4070          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4071               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4072        ) {
 4073         if (    floatx80_is_signaling_nan( a )
 4074              || floatx80_is_signaling_nan( b ) ) {
 4075             float_raise( float_flag_invalid );
 4076         }
 4077         return 0;
 4078     }
 4079     return
 4080            ( a.low == b.low )
 4081         && (    ( a.high == b.high )
 4082              || (    ( a.low == 0 )
 4083                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
 4084            );
 4085 
 4086 }
 4087 
 4088 /*
 4089 -------------------------------------------------------------------------------
 4090 Returns 1 if the extended double-precision floating-point value `a' is
 4091 less than or equal to the corresponding value `b', and 0 otherwise.  The
 4092 comparison is performed according to the IEC/IEEE Standard for Binary
 4093 Floating-Point Arithmetic.
 4094 -------------------------------------------------------------------------------
 4095 */
 4096 flag floatx80_le( floatx80 a, floatx80 b )
 4097 {
 4098     flag aSign, bSign;
 4099 
 4100     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4101               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4102          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4103               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4104        ) {
 4105         float_raise( float_flag_invalid );
 4106         return 0;
 4107     }
 4108     aSign = extractFloatx80Sign( a );
 4109     bSign = extractFloatx80Sign( b );
 4110     if ( aSign != bSign ) {
 4111         return
 4112                aSign
 4113             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4114                  == 0 );
 4115     }
 4116     return
 4117           aSign ? le128( b.high, b.low, a.high, a.low )
 4118         : le128( a.high, a.low, b.high, b.low );
 4119 
 4120 }
 4121 
 4122 /*
 4123 -------------------------------------------------------------------------------
 4124 Returns 1 if the extended double-precision floating-point value `a' is
 4125 less than the corresponding value `b', and 0 otherwise.  The comparison
 4126 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4127 Arithmetic.
 4128 -------------------------------------------------------------------------------
 4129 */
 4130 flag floatx80_lt( floatx80 a, floatx80 b )
 4131 {
 4132     flag aSign, bSign;
 4133 
 4134     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4135               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4136          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4137               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4138        ) {
 4139         float_raise( float_flag_invalid );
 4140         return 0;
 4141     }
 4142     aSign = extractFloatx80Sign( a );
 4143     bSign = extractFloatx80Sign( b );
 4144     if ( aSign != bSign ) {
 4145         return
 4146                aSign
 4147             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4148                  != 0 );
 4149     }
 4150     return
 4151           aSign ? lt128( b.high, b.low, a.high, a.low )
 4152         : lt128( a.high, a.low, b.high, b.low );
 4153 
 4154 }
 4155 
 4156 /*
 4157 -------------------------------------------------------------------------------
 4158 Returns 1 if the extended double-precision floating-point value `a' is equal
 4159 to the corresponding value `b', and 0 otherwise.  The invalid exception is
 4160 raised if either operand is a NaN.  Otherwise, the comparison is performed
 4161 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4162 -------------------------------------------------------------------------------
 4163 */
 4164 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
 4165 {
 4166 
 4167     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4168               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4169          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4170               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4171        ) {
 4172         float_raise( float_flag_invalid );
 4173         return 0;
 4174     }
 4175     return
 4176            ( a.low == b.low )
 4177         && (    ( a.high == b.high )
 4178              || (    ( a.low == 0 )
 4179                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
 4180            );
 4181 
 4182 }
 4183 
 4184 /*
 4185 -------------------------------------------------------------------------------
 4186 Returns 1 if the extended double-precision floating-point value `a' is less
 4187 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
 4188 do not cause an exception.  Otherwise, the comparison is performed according
 4189 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4190 -------------------------------------------------------------------------------
 4191 */
 4192 flag floatx80_le_quiet( floatx80 a, floatx80 b )
 4193 {
 4194     flag aSign, bSign;
 4195 
 4196     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4197               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4198          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4199               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4200        ) {
 4201         if (    floatx80_is_signaling_nan( a )
 4202              || floatx80_is_signaling_nan( b ) ) {
 4203             float_raise( float_flag_invalid );
 4204         }
 4205         return 0;
 4206     }
 4207     aSign = extractFloatx80Sign( a );
 4208     bSign = extractFloatx80Sign( b );
 4209     if ( aSign != bSign ) {
 4210         return
 4211                aSign
 4212             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4213                  == 0 );
 4214     }
 4215     return
 4216           aSign ? le128( b.high, b.low, a.high, a.low )
 4217         : le128( a.high, a.low, b.high, b.low );
 4218 
 4219 }
 4220 
 4221 /*
 4222 -------------------------------------------------------------------------------
 4223 Returns 1 if the extended double-precision floating-point value `a' is less
 4224 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
 4225 an exception.  Otherwise, the comparison is performed according to the
 4226 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4227 -------------------------------------------------------------------------------
 4228 */
 4229 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
 4230 {
 4231     flag aSign, bSign;
 4232 
 4233     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4234               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4235          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4236               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4237        ) {
 4238         if (    floatx80_is_signaling_nan( a )
 4239              || floatx80_is_signaling_nan( b ) ) {
 4240             float_raise( float_flag_invalid );
 4241         }
 4242         return 0;
 4243     }
 4244     aSign = extractFloatx80Sign( a );
 4245     bSign = extractFloatx80Sign( b );
 4246     if ( aSign != bSign ) {
 4247         return
 4248                aSign
 4249             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4250                  != 0 );
 4251     }
 4252     return
 4253           aSign ? lt128( b.high, b.low, a.high, a.low )
 4254         : lt128( a.high, a.low, b.high, b.low );
 4255 
 4256 }
 4257 
 4258 #endif
 4259 
 4260 #ifdef FLOAT128
 4261 
 4262 /*
 4263 -------------------------------------------------------------------------------
 4264 Returns the result of converting the quadruple-precision floating-point
 4265 value `a' to the 32-bit two's complement integer format.  The conversion
 4266 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4267 Arithmetic---which means in particular that the conversion is rounded
 4268 according to the current rounding mode.  If `a' is a NaN, the largest
 4269 positive integer is returned.  Otherwise, if the conversion overflows, the
 4270 largest integer with the same sign as `a' is returned.
 4271 -------------------------------------------------------------------------------
 4272 */
 4273 int32 float128_to_int32( float128 a )
 4274 {
 4275     flag aSign;
 4276     int32 aExp, shiftCount;
 4277     bits64 aSig0, aSig1;
 4278 
 4279     aSig1 = extractFloat128Frac1( a );
 4280     aSig0 = extractFloat128Frac0( a );
 4281     aExp = extractFloat128Exp( a );
 4282     aSign = extractFloat128Sign( a );
 4283     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
 4284     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4285     aSig0 |= ( aSig1 != 0 );
 4286     shiftCount = 0x4028 - aExp;
 4287     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
 4288     return roundAndPackInt32( aSign, aSig0 );
 4289 
 4290 }
 4291 
 4292 /*
 4293 -------------------------------------------------------------------------------
 4294 Returns the result of converting the quadruple-precision floating-point
 4295 value `a' to the 32-bit two's complement integer format.  The conversion
 4296 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4297 Arithmetic, except that the conversion is always rounded toward zero.  If
 4298 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 4299 conversion overflows, the largest integer with the same sign as `a' is
 4300 returned.
 4301 -------------------------------------------------------------------------------
 4302 */
 4303 int32 float128_to_int32_round_to_zero( float128 a )
 4304 {
 4305     flag aSign;
 4306     int32 aExp, shiftCount;
 4307     bits64 aSig0, aSig1, savedASig;
 4308     int32 z;
 4309 
 4310     aSig1 = extractFloat128Frac1( a );
 4311     aSig0 = extractFloat128Frac0( a );
 4312     aExp = extractFloat128Exp( a );
 4313     aSign = extractFloat128Sign( a );
 4314     aSig0 |= ( aSig1 != 0 );
 4315     if ( 0x401E < aExp ) {
 4316         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
 4317         goto invalid;
 4318     }
 4319     else if ( aExp < 0x3FFF ) {
 4320         if ( aExp || aSig0 ) float_set_inexact();
 4321         return 0;
 4322     }
 4323     aSig0 |= LIT64( 0x0001000000000000 );
 4324     shiftCount = 0x402F - aExp;
 4325     savedASig = aSig0;
 4326     aSig0 >>= shiftCount;
 4327     z = aSig0;
 4328     if ( aSign ) z = - z;
 4329     if ( ( z < 0 ) ^ aSign ) {
 4330  invalid:
 4331         float_raise( float_flag_invalid );
 4332         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 4333     }
 4334     if ( ( aSig0<<shiftCount ) != savedASig ) {
 4335         float_set_inexact();
 4336     }
 4337     return z;
 4338 
 4339 }
 4340 
 4341 /*
 4342 -------------------------------------------------------------------------------
 4343 Returns the result of converting the quadruple-precision floating-point
 4344 value `a' to the 64-bit two's complement integer format.  The conversion
 4345 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4346 Arithmetic---which means in particular that the conversion is rounded
 4347 according to the current rounding mode.  If `a' is a NaN, the largest
 4348 positive integer is returned.  Otherwise, if the conversion overflows, the
 4349 largest integer with the same sign as `a' is returned.
 4350 -------------------------------------------------------------------------------
 4351 */
 4352 int64 float128_to_int64( float128 a )
 4353 {
 4354     flag aSign;
 4355     int32 aExp, shiftCount;
 4356     bits64 aSig0, aSig1;
 4357 
 4358     aSig1 = extractFloat128Frac1( a );
 4359     aSig0 = extractFloat128Frac0( a );
 4360     aExp = extractFloat128Exp( a );
 4361     aSign = extractFloat128Sign( a );
 4362     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4363     shiftCount = 0x402F - aExp;
 4364     if ( shiftCount <= 0 ) {
 4365         if ( 0x403E < aExp ) {
 4366             float_raise( float_flag_invalid );
 4367             if (    ! aSign
 4368                  || (    ( aExp == 0x7FFF )
 4369                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
 4370                     )
 4371                ) {
 4372                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 4373             }
 4374             return (sbits64) LIT64( 0x8000000000000000 );
 4375         }
 4376         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
 4377     }
 4378     else {
 4379         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
 4380     }
 4381     return roundAndPackInt64( aSign, aSig0, aSig1 );
 4382 
 4383 }
 4384 
 4385 /*
 4386 -------------------------------------------------------------------------------
 4387 Returns the result of converting the quadruple-precision floating-point
 4388 value `a' to the 64-bit two's complement integer format.  The conversion
 4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4390 Arithmetic, except that the conversion is always rounded toward zero.
 4391 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 4392 the conversion overflows, the largest integer with the same sign as `a' is
 4393 returned.
 4394 -------------------------------------------------------------------------------
 4395 */
 4396 int64 float128_to_int64_round_to_zero( float128 a )
 4397 {
 4398     flag aSign;
 4399     int32 aExp, shiftCount;
 4400     bits64 aSig0, aSig1;
 4401     int64 z;
 4402 
 4403     aSig1 = extractFloat128Frac1( a );
 4404     aSig0 = extractFloat128Frac0( a );
 4405     aExp = extractFloat128Exp( a );
 4406     aSign = extractFloat128Sign( a );
 4407     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4408     shiftCount = aExp - 0x402F;
 4409     if ( 0 < shiftCount ) {
 4410         if ( 0x403E <= aExp ) {
 4411             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
 4412             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
 4413                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
 4414                 if ( aSig1 ) float_set_inexact();
 4415             }
 4416             else {
 4417                 float_raise( float_flag_invalid );
 4418                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
 4419                     return LIT64( 0x7FFFFFFFFFFFFFFF );
 4420                 }
 4421             }
 4422             return (sbits64) LIT64( 0x8000000000000000 );
 4423         }
 4424         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
 4425         if ( (bits64) ( aSig1<<shiftCount ) ) {
 4426             float_set_inexact();
 4427         }
 4428     }
 4429     else {
 4430         if ( aExp < 0x3FFF ) {
 4431             if ( aExp | aSig0 | aSig1 ) {
 4432                 float_set_inexact();
 4433             }
 4434             return 0;
 4435         }
 4436         z = aSig0>>( - shiftCount );
 4437         if (    aSig1
 4438              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
 4439             float_set_inexact();
 4440         }
 4441     }
 4442     if ( aSign ) z = - z;
 4443     return z;
 4444 
 4445 }
 4446 
 4447 /*
 4448 -------------------------------------------------------------------------------
 4449 Returns the result of converting the quadruple-precision floating-point
 4450 value `a' to the single-precision floating-point format.  The conversion
 4451 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4452 Arithmetic.
 4453 -------------------------------------------------------------------------------
 4454 */
 4455 float32 float128_to_float32( float128 a )
 4456 {
 4457     flag aSign;
 4458     int32 aExp;
 4459     bits64 aSig0, aSig1;
 4460     bits32 zSig;
 4461 
 4462     aSig1 = extractFloat128Frac1( a );
 4463     aSig0 = extractFloat128Frac0( a );
 4464     aExp = extractFloat128Exp( a );
 4465     aSign = extractFloat128Sign( a );
 4466     if ( aExp == 0x7FFF ) {
 4467         if ( aSig0 | aSig1 ) {
 4468             return commonNaNToFloat32( float128ToCommonNaN( a ) );
 4469         }
 4470         return packFloat32( aSign, 0xFF, 0 );
 4471     }
 4472     aSig0 |= ( aSig1 != 0 );
 4473     shift64RightJamming( aSig0, 18, &aSig0 );
 4474     zSig = aSig0;
 4475     if ( aExp || zSig ) {
 4476         zSig |= 0x40000000;
 4477         aExp -= 0x3F81;
 4478     }
 4479     return roundAndPackFloat32( aSign, aExp, zSig );
 4480 
 4481 }
 4482 
 4483 /*
 4484 -------------------------------------------------------------------------------
 4485 Returns the result of converting the quadruple-precision floating-point
 4486 value `a' to the double-precision floating-point format.  The conversion
 4487 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4488 Arithmetic.
 4489 -------------------------------------------------------------------------------
 4490 */
 4491 float64 float128_to_float64( float128 a )
 4492 {
 4493     flag aSign;
 4494     int32 aExp;
 4495     bits64 aSig0, aSig1;
 4496 
 4497     aSig1 = extractFloat128Frac1( a );
 4498     aSig0 = extractFloat128Frac0( a );
 4499     aExp = extractFloat128Exp( a );
 4500     aSign = extractFloat128Sign( a );
 4501     if ( aExp == 0x7FFF ) {
 4502         if ( aSig0 | aSig1 ) {
 4503             return commonNaNToFloat64( float128ToCommonNaN( a ) );
 4504         }
 4505         return packFloat64( aSign, 0x7FF, 0 );
 4506     }
 4507     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
 4508     aSig0 |= ( aSig1 != 0 );
 4509     if ( aExp || aSig0 ) {
 4510         aSig0 |= LIT64( 0x4000000000000000 );
 4511         aExp -= 0x3C01;
 4512     }
 4513     return roundAndPackFloat64( aSign, aExp, aSig0 );
 4514 
 4515 }
 4516 
 4517 #ifdef FLOATX80
 4518 
 4519 /*
 4520 -------------------------------------------------------------------------------
 4521 Returns the result of converting the quadruple-precision floating-point
 4522 value `a' to the extended double-precision floating-point format.  The
 4523 conversion is performed according to the IEC/IEEE Standard for Binary
 4524 Floating-Point Arithmetic.
 4525 -------------------------------------------------------------------------------
 4526 */
 4527 floatx80 float128_to_floatx80( float128 a )
 4528 {
 4529     flag aSign;
 4530     int32 aExp;
 4531     bits64 aSig0, aSig1;
 4532 
 4533     aSig1 = extractFloat128Frac1( a );
 4534     aSig0 = extractFloat128Frac0( a );
 4535     aExp = extractFloat128Exp( a );
 4536     aSign = extractFloat128Sign( a );
 4537     if ( aExp == 0x7FFF ) {
 4538         if ( aSig0 | aSig1 ) {
 4539             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
 4540         }
 4541         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 4542     }
 4543     if ( aExp == 0 ) {
 4544         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
 4545         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4546     }
 4547     else {
 4548         aSig0 |= LIT64( 0x0001000000000000 );
 4549     }
 4550     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
 4551     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
 4552 
 4553 }
 4554 
 4555 #endif
 4556 
 4557 /*
 4558 -------------------------------------------------------------------------------
 4559 Rounds the quadruple-precision floating-point value `a' to an integer, and
 4560 returns the result as a quadruple-precision floating-point value.  The
 4561 operation is performed according to the IEC/IEEE Standard for Binary
 4562 Floating-Point Arithmetic.
 4563 -------------------------------------------------------------------------------
 4564 */
 4565 float128 float128_round_to_int( float128 a )
 4566 {
 4567     flag aSign;
 4568     int32 aExp;
 4569     bits64 lastBitMask, roundBitsMask;
 4570     int8 roundingMode;
 4571     float128 z;
 4572 
 4573     aExp = extractFloat128Exp( a );
 4574     if ( 0x402F <= aExp ) {
 4575         if ( 0x406F <= aExp ) {
 4576             if (    ( aExp == 0x7FFF )
 4577                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
 4578                ) {
 4579                 return propagateFloat128NaN( a, a );
 4580             }
 4581             return a;
 4582         }
 4583         lastBitMask = 1;
 4584         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
 4585         roundBitsMask = lastBitMask - 1;
 4586         z = a;
 4587         roundingMode = float_rounding_mode();
 4588         if ( roundingMode == float_round_nearest_even ) {
 4589             if ( lastBitMask ) {
 4590                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
 4591                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
 4592             }
 4593             else {
 4594                 if ( (sbits64) z.low < 0 ) {
 4595                     ++z.high;
 4596                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
 4597                 }
 4598             }
 4599         }
 4600         else if ( roundingMode != float_round_to_zero ) {
 4601             if (   extractFloat128Sign( z )
 4602                  ^ ( roundingMode == float_round_up ) ) {
 4603                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
 4604             }
 4605         }
 4606         z.low &= ~ roundBitsMask;
 4607     }
 4608     else {
 4609         if ( aExp < 0x3FFF ) {
 4610             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
 4611             float_set_inexact();
 4612             aSign = extractFloat128Sign( a );
 4613             switch ( float_rounding_mode() ) {
 4614              case float_round_nearest_even:
 4615                 if (    ( aExp == 0x3FFE )
 4616                      && (   extractFloat128Frac0( a )
 4617                           | extractFloat128Frac1( a ) )
 4618                    ) {
 4619                     return packFloat128( aSign, 0x3FFF, 0, 0 );
 4620                 }
 4621                 break;
 4622              case float_round_down:
 4623                 return
 4624                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
 4625                     : packFloat128( 0, 0, 0, 0 );
 4626              case float_round_up:
 4627                 return
 4628                       aSign ? packFloat128( 1, 0, 0, 0 )
 4629                     : packFloat128( 0, 0x3FFF, 0, 0 );
 4630             }
 4631             return packFloat128( aSign, 0, 0, 0 );
 4632         }
 4633         lastBitMask = 1;
 4634         lastBitMask <<= 0x402F - aExp;
 4635         roundBitsMask = lastBitMask - 1;
 4636         z.low = 0;
 4637         z.high = a.high;
 4638         roundingMode = float_rounding_mode();
 4639         if ( roundingMode == float_round_nearest_even ) {
 4640             z.high += lastBitMask>>1;
 4641             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
 4642                 z.high &= ~ lastBitMask;
 4643             }
 4644         }
 4645         else if ( roundingMode != float_round_to_zero ) {
 4646             if (   extractFloat128Sign( z )
 4647                  ^ ( roundingMode == float_round_up ) ) {
 4648                 z.high |= ( a.low != 0 );
 4649                 z.high += roundBitsMask;
 4650             }
 4651         }
 4652         z.high &= ~ roundBitsMask;
 4653     }
 4654     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
 4655         float_set_inexact();
 4656     }
 4657     return z;
 4658 
 4659 }
 4660 
 4661 /*
 4662 -------------------------------------------------------------------------------
 4663 Returns the result of adding the absolute values of the quadruple-precision
 4664 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 4665 before being returned.  `zSign' is ignored if the result is a NaN.
 4666 The addition is performed according to the IEC/IEEE Standard for Binary
 4667 Floating-Point Arithmetic.
 4668 -------------------------------------------------------------------------------
 4669 */
 4670 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
 4671 {
 4672     int32 aExp, bExp, zExp;
 4673     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
 4674     int32 expDiff;
 4675 
 4676     aSig1 = extractFloat128Frac1( a );
 4677     aSig0 = extractFloat128Frac0( a );
 4678     aExp = extractFloat128Exp( a );
 4679     bSig1 = extractFloat128Frac1( b );
 4680     bSig0 = extractFloat128Frac0( b );
 4681     bExp = extractFloat128Exp( b );
 4682     expDiff = aExp - bExp;
 4683     if ( 0 < expDiff ) {
 4684         if ( aExp == 0x7FFF ) {
 4685             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4686             return a;
 4687         }
 4688         if ( bExp == 0 ) {
 4689             --expDiff;
 4690         }
 4691         else {
 4692             bSig0 |= LIT64( 0x0001000000000000 );
 4693         }
 4694         shift128ExtraRightJamming(
 4695             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
 4696         zExp = aExp;
 4697     }
 4698     else if ( expDiff < 0 ) {
 4699         if ( bExp == 0x7FFF ) {
 4700             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4701             return packFloat128( zSign, 0x7FFF, 0, 0 );
 4702         }
 4703         if ( aExp == 0 ) {
 4704             ++expDiff;
 4705         }
 4706         else {
 4707             aSig0 |= LIT64( 0x0001000000000000 );
 4708         }
 4709         shift128ExtraRightJamming(
 4710             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
 4711         zExp = bExp;
 4712     }
 4713     else {
 4714         if ( aExp == 0x7FFF ) {
 4715             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
 4716                 return propagateFloat128NaN( a, b );
 4717             }
 4718             return a;
 4719         }
 4720         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4721         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
 4722         zSig2 = 0;
 4723         zSig0 |= LIT64( 0x0002000000000000 );
 4724         zExp = aExp;
 4725         goto shiftRight1;
 4726     }
 4727     aSig0 |= LIT64( 0x0001000000000000 );
 4728     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4729     --zExp;
 4730     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
 4731     ++zExp;
 4732  shiftRight1:
 4733     shift128ExtraRightJamming(
 4734         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 4735  roundAndPack:
 4736     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 4737 
 4738 }
 4739 
 4740 /*
 4741 -------------------------------------------------------------------------------
 4742 Returns the result of subtracting the absolute values of the quadruple-
 4743 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 4744 difference is negated before being returned.  `zSign' is ignored if the
 4745 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 4746 Standard for Binary Floating-Point Arithmetic.
 4747 -------------------------------------------------------------------------------
 4748 */
 4749 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
 4750 {
 4751     int32 aExp, bExp, zExp;
 4752     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
 4753     int32 expDiff;
 4754     float128 z;
 4755 
 4756     aSig1 = extractFloat128Frac1( a );
 4757     aSig0 = extractFloat128Frac0( a );
 4758     aExp = extractFloat128Exp( a );
 4759     bSig1 = extractFloat128Frac1( b );
 4760     bSig0 = extractFloat128Frac0( b );
 4761     bExp = extractFloat128Exp( b );
 4762     expDiff = aExp - bExp;
 4763     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
 4764     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
 4765     if ( 0 < expDiff ) goto aExpBigger;
 4766     if ( expDiff < 0 ) goto bExpBigger;
 4767     if ( aExp == 0x7FFF ) {
 4768         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
 4769             return propagateFloat128NaN( a, b );
 4770         }
 4771         float_raise( float_flag_invalid );
 4772         z.low = float128_default_nan_low;
 4773         z.high = float128_default_nan_high;
 4774         return z;
 4775     }
 4776     if ( aExp == 0 ) {
 4777         aExp = 1;
 4778         bExp = 1;
 4779     }
 4780     if ( bSig0 < aSig0 ) goto aBigger;
 4781     if ( aSig0 < bSig0 ) goto bBigger;
 4782     if ( bSig1 < aSig1 ) goto aBigger;
 4783     if ( aSig1 < bSig1 ) goto bBigger;
 4784     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
 4785  bExpBigger:
 4786     if ( bExp == 0x7FFF ) {
 4787         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4788         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
 4789     }
 4790     if ( aExp == 0 ) {
 4791         ++expDiff;
 4792     }
 4793     else {
 4794         aSig0 |= LIT64( 0x4000000000000000 );
 4795     }
 4796     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
 4797     bSig0 |= LIT64( 0x4000000000000000 );
 4798  bBigger:
 4799     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
 4800     zExp = bExp;
 4801     zSign ^= 1;
 4802     goto normalizeRoundAndPack;
 4803  aExpBigger:
 4804     if ( aExp == 0x7FFF ) {
 4805         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4806         return a;
 4807     }
 4808     if ( bExp == 0 ) {
 4809         --expDiff;
 4810     }
 4811     else {
 4812         bSig0 |= LIT64( 0x4000000000000000 );
 4813     }
 4814     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
 4815     aSig0 |= LIT64( 0x4000000000000000 );
 4816  aBigger:
 4817     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4818     zExp = aExp;
 4819  normalizeRoundAndPack:
 4820     --zExp;
 4821     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
 4822 
 4823 }
 4824 
 4825 /*
 4826 -------------------------------------------------------------------------------
 4827 Returns the result of adding the quadruple-precision floating-point values
 4828 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 4829 for Binary Floating-Point Arithmetic.
 4830 -------------------------------------------------------------------------------
 4831 */
 4832 float128 float128_add( float128 a, float128 b )
 4833 {
 4834     flag aSign, bSign;
 4835 
 4836     aSign = extractFloat128Sign( a );
 4837     bSign = extractFloat128Sign( b );
 4838     if ( aSign == bSign ) {
 4839         return addFloat128Sigs( a, b, aSign );
 4840     }
 4841     else {
 4842         return subFloat128Sigs( a, b, aSign );
 4843     }
 4844 
 4845 }
 4846 
 4847 /*
 4848 -------------------------------------------------------------------------------
 4849 Returns the result of subtracting the quadruple-precision floating-point
 4850 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 4851 Standard for Binary Floating-Point Arithmetic.
 4852 -------------------------------------------------------------------------------
 4853 */
 4854 float128 float128_sub( float128 a, float128 b )
 4855 {
 4856     flag aSign, bSign;
 4857 
 4858     aSign = extractFloat128Sign( a );
 4859     bSign = extractFloat128Sign( b );
 4860     if ( aSign == bSign ) {
 4861         return subFloat128Sigs( a, b, aSign );
 4862     }
 4863     else {
 4864         return addFloat128Sigs( a, b, aSign );
 4865     }
 4866 
 4867 }
 4868 
 4869 /*
 4870 -------------------------------------------------------------------------------
 4871 Returns the result of multiplying the quadruple-precision floating-point
 4872 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 4873 Standard for Binary Floating-Point Arithmetic.
 4874 -------------------------------------------------------------------------------
 4875 */
 4876 float128 float128_mul( float128 a, float128 b )
 4877 {
 4878     flag aSign, bSign, zSign;
 4879     int32 aExp, bExp, zExp;
 4880     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
 4881     float128 z;
 4882 
 4883     aSig1 = extractFloat128Frac1( a );
 4884     aSig0 = extractFloat128Frac0( a );
 4885     aExp = extractFloat128Exp( a );
 4886     aSign = extractFloat128Sign( a );
 4887     bSig1 = extractFloat128Frac1( b );
 4888     bSig0 = extractFloat128Frac0( b );
 4889     bExp = extractFloat128Exp( b );
 4890     bSign = extractFloat128Sign( b );
 4891     zSign = aSign ^ bSign;
 4892     if ( aExp == 0x7FFF ) {
 4893         if (    ( aSig0 | aSig1 )
 4894              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
 4895             return propagateFloat128NaN( a, b );
 4896         }
 4897         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
 4898         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4899     }
 4900     if ( bExp == 0x7FFF ) {
 4901         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4902         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 4903  invalid:
 4904             float_raise( float_flag_invalid );
 4905             z.low = float128_default_nan_low;
 4906             z.high = float128_default_nan_high;
 4907             return z;
 4908         }
 4909         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4910     }
 4911     if ( aExp == 0 ) {
 4912         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4913         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4914     }
 4915     if ( bExp == 0 ) {
 4916         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4917         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 4918     }
 4919     zExp = aExp + bExp - 0x4000;
 4920     aSig0 |= LIT64( 0x0001000000000000 );
 4921     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
 4922     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
 4923     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
 4924     zSig2 |= ( zSig3 != 0 );
 4925     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
 4926         shift128ExtraRightJamming(
 4927             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 4928         ++zExp;
 4929     }
 4930     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 4931 
 4932 }
 4933 
 4934 /*
 4935 -------------------------------------------------------------------------------
 4936 Returns the result of dividing the quadruple-precision floating-point value
 4937 `a' by the corresponding value `b'.  The operation is performed according to
 4938 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4939 -------------------------------------------------------------------------------
 4940 */
 4941 float128 float128_div( float128 a, float128 b )
 4942 {
 4943     flag aSign, bSign, zSign;
 4944     int32 aExp, bExp, zExp;
 4945     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
 4946     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 4947     float128 z;
 4948 
          /* Split both operands into fraction words, exponent and sign. */
 4949     aSig1 = extractFloat128Frac1( a );
 4950     aSig0 = extractFloat128Frac0( a );
 4951     aExp = extractFloat128Exp( a );
 4952     aSign = extractFloat128Sign( a );
 4953     bSig1 = extractFloat128Frac1( b );
 4954     bSig0 = extractFloat128Frac0( b );
 4955     bExp = extractFloat128Exp( b );
 4956     bSign = extractFloat128Sign( b );
          /* The quotient's sign is the XOR of the operand signs. */
 4957     zSign = aSign ^ bSign;
          /* `a' has the maximum exponent (0x7FFF): a nonzero fraction in
             either maximum-exponent operand propagates a NaN, infinity
             divided by infinity is invalid, and infinity divided by anything
             else is a signed infinity. */
 4958     if ( aExp == 0x7FFF ) {
 4959         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4960         if ( bExp == 0x7FFF ) {
 4961             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4962             goto invalid;
 4963         }
 4964         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4965     }
          /* Only `b' has the maximum exponent: a NaN propagates, otherwise the
             result is packed with zero exponent and zero fraction (signed
             zero). */
 4966     if ( bExp == 0x7FFF ) {
 4967         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4968         return packFloat128( zSign, 0, 0, 0 );
 4969     }
          /* Zero exponent in `b': either a zero divisor or a subnormal one. */
 4970     if ( bExp == 0 ) {
 4971         if ( ( bSig0 | bSig1 ) == 0 ) {
                  /* Zero divided by zero shares the invalid path with
                     infinity divided by infinity: raise invalid and return
                     the default NaN. */
 4972             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 4973  invalid:
 4974                 float_raise( float_flag_invalid );
 4975                 z.low = float128_default_nan_low;
 4976                 z.high = float128_default_nan_high;
 4977                 return z;
 4978             }
                  /* Nonzero dividend over zero: raise divide-by-zero and
                     return a signed infinity. */
 4979             float_raise( float_flag_divbyzero );
 4980             return packFloat128( zSign, 0x7FFF, 0, 0 );
 4981         }
              /* Subnormal divisor: the helper rewrites bExp and bSig0/bSig1
                 (presumably into normalized form; see its definition). */
 4982         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 4983     }
          /* Zero exponent in `a': a zero dividend gives a signed zero, a
             subnormal one goes through the same helper. */
 4984     if ( aExp == 0 ) {
 4985         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4986         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4987     }
          /* Provisional exponent of the quotient. */
 4988     zExp = aExp - bExp + 0x3FFD;
          /* Set bit 48 of each high word (the leading significand bit) and
             shift both significands left by 15. */
 4989     shortShift128Left(
 4990         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
 4991     shortShift128Left(
 4992         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
          /* When b <= a, halve `a' and bump the exponent (presumably so the
             dividend is smaller than the divisor going into the estimate). */
 4993     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
 4994         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
 4995         ++zExp;
 4996     }
          /* Upper 64 quotient bits: take an estimate, form the remainder
             a - b*zSig0, and decrement the estimate while that remainder is
             negative. */
 4997     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
 4998     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
 4999     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
 5000     while ( (sbits64) rem0 < 0 ) {
 5001         --zSig0;
 5002         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
 5003     }
          /* Lower 64 quotient bits, estimated from the remainder.  The exact
             remainder is computed only when the low 14 bits of the estimate
             are <= 4; the estimate is then corrected and bit 0 is set if any
             remainder is left (sticky bit). */
 5004     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
 5005     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
 5006         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
 5007         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
 5008         while ( (sbits64) rem1 < 0 ) {
 5009             --zSig1;
 5010             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
 5011         }
 5012         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 5013     }
          /* Shift the quotient right by 15, collecting the shifted-out part
             in zSig2, then round and pack. */
 5014     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
 5015     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 5016 
 5017 }
 5018 
 5019 /*
 5020 -------------------------------------------------------------------------------
 5021 Returns the remainder of the quadruple-precision floating-point value `a'
 5022 with respect to the corresponding value `b'.  The operation is performed
 5023 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5024 -------------------------------------------------------------------------------
 5025 */
 5026 float128 float128_rem( float128 a, float128 b )
 5027 {
 5028     flag aSign, bSign, zSign;
 5029     int32 aExp, bExp, expDiff;
 5030     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
 5031     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
 5032     sbits64 sigMean0;
 5033     float128 z;
 5034 
          /* Split both operands into fraction words, exponent and sign.
             bSign is extracted but not used afterwards in this function. */
 5035     aSig1 = extractFloat128Frac1( a );
 5036     aSig0 = extractFloat128Frac0( a );
 5037     aExp = extractFloat128Exp( a );
 5038     aSign = extractFloat128Sign( a );
 5039     bSig1 = extractFloat128Frac1( b );
 5040     bSig0 = extractFloat128Frac0( b );
 5041     bExp = extractFloat128Exp( b );
 5042     bSign = extractFloat128Sign( b );
          /* `a' has the maximum exponent: a NaN in either operand propagates;
             otherwise `a' is infinite and the remainder is invalid. */
 5043     if ( aExp == 0x7FFF ) {
 5044         if (    ( aSig0 | aSig1 )
 5045              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
 5046             return propagateFloat128NaN( a, b );
 5047         }
 5048         goto invalid;
 5049     }
          /* `b' has the maximum exponent: a NaN propagates; with an infinite
             `b' the finite `a' is returned unchanged. */
 5050     if ( bExp == 0x7FFF ) {
 5051         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 5052         return a;
 5053     }
          /* Zero exponent in `b': a zero divisor is invalid (shared with the
             infinite-dividend case above); a subnormal one is passed to the
             normalization helper. */
 5054     if ( bExp == 0 ) {
 5055         if ( ( bSig0 | bSig1 ) == 0 ) {
 5056  invalid:
 5057             float_raise( float_flag_invalid );
 5058             z.low = float128_default_nan_low;
 5059             z.high = float128_default_nan_high;
 5060             return z;
 5061         }
 5062         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 5063     }
          /* A zero `a' is returned as is; a subnormal `a' is normalized. */
 5064     if ( aExp == 0 ) {
 5065         if ( ( aSig0 | aSig1 ) == 0 ) return a;
 5066         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 5067     }
 5068     expDiff = aExp - bExp;
          /* When the exponent of `a' is more than one below that of `b', `a'
             is returned unchanged. */
 5069     if ( expDiff < -1 ) return a;
          /* Set bit 48 of each high word and shift left: `b' by 15, `a' by
             15, or by 14 when expDiff is -1. */
 5070     shortShift128Left(
 5071         aSig0 | LIT64( 0x0001000000000000 ),
 5072         aSig1,
 5073         15 - ( expDiff < 0 ),
 5074         &aSig0,
 5075         &aSig1
 5076     );
 5077     shortShift128Left(
 5078         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
          /* First quotient bit: 1 when b <= a, in which case `b' is
             subtracted once. */
 5079     q = le128( bSig0, bSig1, aSig0, aSig1 );
 5080     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
 5081     expDiff -= 64;
          /* Main reduction loop.  Each pass takes a quotient estimate lowered
             by 4 (clamped at 0), subtracts q*b from the running remainder
             with both sides shifted left by 61, and consumes 61 from
             expDiff. */
 5082     while ( 0 < expDiff ) {
 5083         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
 5084         q = ( 4 < q ) ? q - 4 : 0;
 5085         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
 5086         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
 5087         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
 5088         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
 5089         expDiff -= 61;
 5090     }
          /* Final partial step: the same lowered estimate, shifted right by
             -expDiff; `b' is shifted right by 12 and `a' is realigned by
             expDiff + 52 before q*b is subtracted.  With no quotient bits
             left (else branch) both values are only shifted right by 12. */
 5091     if ( -64 < expDiff ) {
 5092         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
 5093         q = ( 4 < q ) ? q - 4 : 0;
 5094         q >>= - expDiff;
 5095         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
 5096         expDiff += 52;
 5097         if ( expDiff < 0 ) {
 5098             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
 5099         }
 5100         else {
 5101             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
 5102         }
 5103         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
 5104         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
 5105     }
 5106     else {
 5107         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
 5108         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
 5109     }
          /* Subtract `b' until the remainder turns negative.  alternateASig
             keeps the last value before each subtraction and q counts the
             subtractions. */
 5110     do {
 5111         alternateASig0 = aSig0;
 5112         alternateASig1 = aSig1;
 5113         ++q;
 5114         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
 5115     } while ( 0 <= (sbits64) aSig0 );
          /* Choose between the negative remainder and the saved non-negative
             one via their sum: the saved value is taken when the sum is
             negative, or when the sum is zero and q is odd. */
 5116     add128(
 5117         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
 5118     if (    ( sigMean0 < 0 )
 5119          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
 5120         aSig0 = alternateASig0;
 5121         aSig1 = alternateASig1;
 5122     }
          /* A negative remainder is negated and its sign folded into the
             result sign together with the sign of `a'. */
 5123     zSign = ( (sbits64) aSig0 < 0 );
 5124     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
 5125     return
 5126         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
 5127 
 5128 }
 5129 
 5130 /*
 5131 -------------------------------------------------------------------------------
 5132 Returns the square root of the quadruple-precision floating-point value `a'.
 5133 The operation is performed according to the IEC/IEEE Standard for Binary
 5134 Floating-Point Arithmetic.
 5135 -------------------------------------------------------------------------------
 5136 */
 5137 float128 float128_sqrt( float128 a )
 5138 {
 5139     flag aSign;
 5140     int32 aExp, zExp;
 5141     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
 5142     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 5143     float128 z;
 5144 
          /* Split the operand into fraction words, exponent and sign. */
 5145     aSig1 = extractFloat128Frac1( a );
 5146     aSig0 = extractFloat128Frac0( a );
 5147     aExp = extractFloat128Exp( a );
 5148     aSign = extractFloat128Sign( a );
          /* Maximum exponent: a NaN propagates, positive infinity is returned
             unchanged, negative infinity is invalid. */
 5149     if ( aExp == 0x7FFF ) {
 5150         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
 5151         if ( ! aSign ) return a;
 5152         goto invalid;
 5153     }
          /* Sign bit set: a negative zero is returned unchanged; any other
             negative operand raises invalid and yields the default NaN. */
 5154     if ( aSign ) {
 5155         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 5156  invalid:
 5157         float_raise( float_flag_invalid );
 5158         z.low = float128_default_nan_low;
 5159         z.high = float128_default_nan_high;
 5160         return z;
 5161     }
          /* Zero exponent: a zero operand gives a positive zero; a subnormal
             one is passed to the normalization helper. */
 5162     if ( aExp == 0 ) {
 5163         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
 5164         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 5165     }
          /* Result exponent: half of (aExp - 0x3FFF), plus 0x3FFE. */
 5166     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
          /* Set bit 48 of the high word, take a first estimate from
             aSig0>>17, then shift the significand left by 13, or by 12 when
             aExp is odd. */
 5167     aSig0 |= LIT64( 0x0001000000000000 );
 5168     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
 5169     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
          /* Refine the estimate to 64 bits, then decrement it while the
             remainder a - zSig0*zSig0 is negative.  doubleZSig0 tracks
             2*zSig0. */
 5170     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
 5171     doubleZSig0 = zSig0<<1;
 5172     mul64To128( zSig0, zSig0, &term0, &term1 );
 5173     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
 5174     while ( (sbits64) rem0 < 0 ) {
 5175         --zSig0;
 5176         doubleZSig0 -= 2;
 5177         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
 5178     }
          /* Lower 64 result bits, estimated from the remainder.  The exact
             remainder is computed only when the low 13 bits of the estimate
             are <= 5 (a zero estimate is first bumped to 1); the estimate is
             then corrected and bit 0 is set if any remainder is left (sticky
             bit). */
 5179     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
 5180     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
 5181         if ( zSig1 == 0 ) zSig1 = 1;
 5182         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
 5183         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
 5184         mul64To128( zSig1, zSig1, &term2, &term3 );
 5185         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
 5186         while ( (sbits64) rem1 < 0 ) {
 5187             --zSig1;
 5188             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
 5189             term3 |= 1;
 5190             term2 |= doubleZSig0;
 5191             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
 5192         }
 5193         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 5194     }
          /* Shift the result right by 14, collecting the shifted-out part in
             zSig2, then round and pack with a positive sign. */
 5195     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
 5196     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
 5197 
 5198 }
 5199 
 5200 /*
 5201 -------------------------------------------------------------------------------
 5202 Returns 1 if the quadruple-precision floating-point value `a' is equal to
 5203 the corresponding value `b', and 0 otherwise.  The comparison is performed
 5204 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5205 -------------------------------------------------------------------------------
 5206 */
 5207 flag float128_eq( float128 a, float128 b )
 5208 {
 5209 
 5210     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5211               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5212          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5213               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5214        ) {
 5215         if (    float128_is_signaling_nan( a )
 5216              || float128_is_signaling_nan( b ) ) {
 5217             float_raise( float_flag_invalid );
 5218         }
 5219         return 0;
 5220     }
 5221     return
 5222            ( a.low == b.low )
 5223         && (    ( a.high == b.high )
 5224              || (    ( a.low == 0 )
 5225                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
 5226            );
 5227 
 5228 }
 5229 
 5230 /*
 5231 -------------------------------------------------------------------------------
 5232 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5233 or equal to the corresponding value `b', and 0 otherwise.  The comparison
 5234 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 5235 Arithmetic.
 5236 -------------------------------------------------------------------------------
 5237 */
 5238 flag float128_le( float128 a, float128 b )
 5239 {
 5240     flag aSign, bSign;
 5241 
 5242     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5243               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5244          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5245               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5246        ) {
 5247         float_raise( float_flag_invalid );
 5248         return 0;
 5249     }
 5250     aSign = extractFloat128Sign( a );
 5251     bSign = extractFloat128Sign( b );
 5252     if ( aSign != bSign ) {
 5253         return
 5254                aSign
 5255             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5256                  == 0 );
 5257     }
 5258     return
 5259           aSign ? le128( b.high, b.low, a.high, a.low )
 5260         : le128( a.high, a.low, b.high, b.low );
 5261 
 5262 }
 5263 
 5264 /*
 5265 -------------------------------------------------------------------------------
 5266 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5267 the corresponding value `b', and 0 otherwise.  The comparison is performed
 5268 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5269 -------------------------------------------------------------------------------
 5270 */
 5271 flag float128_lt( float128 a, float128 b )
 5272 {
 5273     flag aSign, bSign;
 5274 
 5275     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5276               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5277          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5278               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5279        ) {
 5280         float_raise( float_flag_invalid );
 5281         return 0;
 5282     }
 5283     aSign = extractFloat128Sign( a );
 5284     bSign = extractFloat128Sign( b );
 5285     if ( aSign != bSign ) {
 5286         return
 5287                aSign
 5288             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5289                  != 0 );
 5290     }
 5291     return
 5292           aSign ? lt128( b.high, b.low, a.high, a.low )
 5293         : lt128( a.high, a.low, b.high, b.low );
 5294 
 5295 }
 5296 
 5297 /*
 5298 -------------------------------------------------------------------------------
 5299 Returns 1 if the quadruple-precision floating-point value `a' is equal to
 5300 the corresponding value `b', and 0 otherwise.  The invalid exception is
 5301 raised if either operand is a NaN.  Otherwise, the comparison is performed
 5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5303 -------------------------------------------------------------------------------
 5304 */
 5305 flag float128_eq_signaling( float128 a, float128 b )
 5306 {
 5307 
 5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5312        ) {
 5313         float_raise( float_flag_invalid );
 5314         return 0;
 5315     }
 5316     return
 5317            ( a.low == b.low )
 5318         && (    ( a.high == b.high )
 5319              || (    ( a.low == 0 )
 5320                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
 5321            );
 5322 
 5323 }
 5324 
 5325 /*
 5326 -------------------------------------------------------------------------------
 5327 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5328 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 5329 cause an exception.  Otherwise, the comparison is performed according to the
 5330 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5331 -------------------------------------------------------------------------------
 5332 */
 5333 flag float128_le_quiet( float128 a, float128 b )
 5334 {
 5335     flag aSign, bSign;
 5336 
 5337     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5338               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5339          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5340               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5341        ) {
 5342         if (    float128_is_signaling_nan( a )
 5343              || float128_is_signaling_nan( b ) ) {
 5344             float_raise( float_flag_invalid );
 5345         }
 5346         return 0;
 5347     }
 5348     aSign = extractFloat128Sign( a );
 5349     bSign = extractFloat128Sign( b );
 5350     if ( aSign != bSign ) {
 5351         return
 5352                aSign
 5353             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5354                  == 0 );
 5355     }
 5356     return
 5357           aSign ? le128( b.high, b.low, a.high, a.low )
 5358         : le128( a.high, a.low, b.high, b.low );
 5359 
 5360 }
 5361 
 5362 /*
 5363 -------------------------------------------------------------------------------
 5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5365 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 5366 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 5367 Standard for Binary Floating-Point Arithmetic.
 5368 -------------------------------------------------------------------------------
 5369 */
 5370 flag float128_lt_quiet( float128 a, float128 b )
 5371 {
 5372     flag aSign, bSign;
 5373 
 5374     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5375               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5376          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5377               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5378        ) {
 5379         if (    float128_is_signaling_nan( a )
 5380              || float128_is_signaling_nan( b ) ) {
 5381             float_raise( float_flag_invalid );
 5382         }
 5383         return 0;
 5384     }
 5385     aSign = extractFloat128Sign( a );
 5386     bSign = extractFloat128Sign( b );
 5387     if ( aSign != bSign ) {
 5388         return
 5389                aSign
 5390             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5391                  != 0 );
 5392     }
 5393     return
 5394           aSign ? lt128( b.high, b.low, a.high, a.low )
 5395         : lt128( a.high, a.low, b.high, b.low );
 5396 
 5397 }
 5398 
 5399 #endif
 5400 
 5401 
 5402 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
 5403 
 5404 /*
 5405  * These two routines are not part of the original softfloat distribution.
 5406  *
 5407  * They are based on the corresponding conversions to integer but return
 5408  * unsigned numbers instead since these functions are required by GCC.
 5409  *
 5410  * Added by Mark Brinicombe <mark@netbsd.org>   27/09/97
 5411  *
 5412  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
 5413  */
 5414 
 5415 /*
 5416 -------------------------------------------------------------------------------
 5417 Returns the result of converting the double-precision floating-point value
 5418 `a' to the 32-bit unsigned integer format.  The conversion is
 5419 performed according to the IEC/IEEE Standard for Binary Floating-point
 5420 Arithmetic, except that the conversion is always rounded toward zero.  If
 5421 `a' is a NaN, the largest positive integer is returned.  If the conversion
 5422 overflows, the largest integer positive is returned.
 5423 -------------------------------------------------------------------------------
 5424 */
 5425 uint32 float64_to_uint32_round_to_zero( float64 a )
 5426 {
 5427     flag aSign;
 5428     int16 aExp, shiftCount;
 5429     bits64 aSig, savedASig;
 5430     uint32 z;
 5431 
 5432     aSig = extractFloat64Frac( a );
 5433     aExp = extractFloat64Exp( a );
 5434     aSign = extractFloat64Sign( a );
 5435 
 5436     if (aSign) {
 5437         float_raise( float_flag_invalid );
 5438         return(0);
 5439     }
 5440 
 5441     if ( 0x41E < aExp ) {
 5442         float_raise( float_flag_invalid );
 5443         return 0xffffffff;
 5444     }
 5445     else if ( aExp < 0x3FF ) {
 5446         if ( aExp || aSig ) float_set_inexact();
 5447         return 0;
 5448     }
 5449     aSig |= LIT64( 0x0010000000000000 );
 5450     shiftCount = 0x433 - aExp;
 5451     savedASig = aSig;
 5452     aSig >>= shiftCount;
 5453     z = aSig;
 5454     if ( ( aSig<<shiftCount ) != savedASig ) {
 5455         float_set_inexact();
 5456     }
 5457     return z;
 5458 
 5459 }
 5460 
 5461 /*
 5462 -------------------------------------------------------------------------------
 5463 Returns the result of converting the single-precision floating-point value
 5464 `a' to the 32-bit unsigned integer format.  The conversion is
 5465 performed according to the IEC/IEEE Standard for Binary Floating-point
 5466 Arithmetic, except that the conversion is always rounded toward zero.  If
 5467 `a' is a NaN, the largest positive integer is returned.  If the conversion
 5468 overflows, the largest positive integer is returned.
 5469 -------------------------------------------------------------------------------
 5470 */
 5471 uint32 float32_to_uint32_round_to_zero( float32 a )
 5472 {
 5473     flag aSign;
 5474     int16 aExp, shiftCount;
 5475     bits32 aSig;
 5476     uint32 z;
 5477 
 5478     aSig = extractFloat32Frac( a );
 5479     aExp = extractFloat32Exp( a );
 5480     aSign = extractFloat32Sign( a );
 5481     shiftCount = aExp - 0x9E;
 5482 
 5483     if (aSign) {
 5484         float_raise( float_flag_invalid );
 5485         return(0);
 5486     }
 5487     if ( 0 < shiftCount ) {
 5488         float_raise( float_flag_invalid );
 5489         return 0xFFFFFFFF;
 5490     }
 5491     else if ( aExp <= 0x7E ) {
 5492         if ( aExp | aSig ) float_set_inexact();
 5493         return 0;
 5494     }
 5495     aSig = ( aSig | 0x800000 )<<8;
 5496     z = aSig>>( - shiftCount );
 5497     if ( aSig<<( shiftCount & 31 ) ) {
 5498         float_set_inexact();
 5499     }
 5500     return z;
 5501 
 5502 }
 5503 
 5504 #endif
 5505 
 5506 #endif /* !NO_IEEE */
/* [<][>][^][v][top][bottom][index][help] */
root/lib/libkern/softfloat.c

DEFINITIONS