git.maemo.org Git - qemu/blob - fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 | input.  If `zSign' is 1, the input is negated before being converted to an
  73 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74 | is simply rounded to an integer, with the inexact exception raised if the
  75 | input cannot be represented exactly as an integer.  However, if the fixed-
  76 | point input is too large, the invalid exception is raised and the largest
  77 | positive or negative integer is returned.
  78 *----------------------------------------------------------------------------*/
  79
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = STATUS(float_rounding_mode);
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_raise( float_flag_invalid STATUS_VAR);
 111         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 121 | and returns the properly rounded 64-bit integer corresponding to the input.
 122 | If `zSign' is 1, the input is negated before being converted to an integer.
 123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 124 | the inexact exception raised if the input cannot be represented exactly as
 125 | an integer.  However, if the fixed-point input is too large, the invalid
 126 | exception is raised and the largest positive or negative integer is
 127 | returned.
 128 *----------------------------------------------------------------------------*/
 129
 130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131 {
 132     int8 roundingMode;
 133     flag roundNearestEven, increment;
 134     int64 z;
 135
 136     roundingMode = STATUS(float_rounding_mode);
 137     roundNearestEven = ( roundingMode == float_round_nearest_even );
 138     increment = ( (sbits64) absZ1 < 0 );
 139     if ( ! roundNearestEven ) {
 140         if ( roundingMode == float_round_to_zero ) {
 141             increment = 0;
 142         }
 143         else {
 144             if ( zSign ) {
 145                 increment = ( roundingMode == float_round_down ) && absZ1;
 146             }
 147             else {
 148                 increment = ( roundingMode == float_round_up ) && absZ1;
 149             }
 150         }
 151     }
 152     if ( increment ) {
 153         ++absZ0;
 154         if ( absZ0 == 0 ) goto overflow;
 155         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156     }
 157     z = absZ0;
 158     if ( zSign ) z = - z;
 159     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160  overflow:
 161         float_raise( float_flag_invalid STATUS_VAR);
 162         return
 163               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164             : LIT64( 0x7FFFFFFFFFFFFFFF );
 165     }
 166     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167     return z;
 168
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the fraction bits of the single-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 INLINE bits32 extractFloat32Frac( float32 a )
 176 {
 177
 178     return a & 0x007FFFFF;
 179
 180 }
 181
 182 /*----------------------------------------------------------------------------
 183 | Returns the exponent bits of the single-precision floating-point value `a'.
 184 *----------------------------------------------------------------------------*/
 185
 186 INLINE int16 extractFloat32Exp( float32 a )
 187 {
 188
 189     return ( a>>23 ) & 0xFF;
 190
 191 }
 192
 193 /*----------------------------------------------------------------------------
 194 | Returns the sign bit of the single-precision floating-point value `a'.
 195 *----------------------------------------------------------------------------*/
 196
 197 INLINE flag extractFloat32Sign( float32 a )
 198 {
 199
 200     return a>>31;
 201
 202 }
 203
 204 /*----------------------------------------------------------------------------
 205 | Normalizes the subnormal single-precision floating-point value represented
 206 | by the denormalized significand `aSig'.  The normalized exponent and
 207 | significand are stored at the locations pointed to by `zExpPtr' and
 208 | `zSigPtr', respectively.
 209 *----------------------------------------------------------------------------*/
 210
 211 static void
 212  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 213 {
 214     int8 shiftCount;
 215
 216     shiftCount = countLeadingZeros32( aSig ) - 8;
 217     *zSigPtr = aSig<<shiftCount;
 218     *zExpPtr = 1 - shiftCount;
 219
 220 }
 221
 222 /*----------------------------------------------------------------------------
 223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 224 | single-precision floating-point value, returning the result.  After being
 225 | shifted into the proper positions, the three fields are simply added
 226 | together to form the result.  This means that any integer portion of `zSig'
 227 | will be added into the exponent.  Since a properly normalized significand
 228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 229 | than the desired result exponent whenever `zSig' is a complete, normalized
 230 | significand.
 231 *----------------------------------------------------------------------------*/
 232
 233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 234 {
 235
 236     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 237
 238 }
 239
 240 /*----------------------------------------------------------------------------
 241 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 242 | and significand `zSig', and returns the proper single-precision floating-
 243 | point value corresponding to the abstract input.  Ordinarily, the abstract
 244 | value is simply rounded and packed into the single-precision format, with
 245 | the inexact exception raised if the abstract input cannot be represented
 246 | exactly.  However, if the abstract value is too large, the overflow and
 247 | inexact exceptions are raised and an infinity or maximal finite value is
 248 | returned.  If the abstract value is too small, the input value is rounded to
 249 | a subnormal number, and the underflow and inexact exceptions are raised if
 250 | the abstract input cannot be represented exactly as a subnormal single-
 251 | precision floating-point number.
 252 |     The input significand `zSig' has its binary point between bits 30
 253 | and 29, which is 7 bits to the left of the usual location.  This shifted
 254 | significand must be normalized or smaller.  If `zSig' is not normalized,
 255 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 256 | and it must not require rounding.  In the usual case that `zSig' is
 257 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 258 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 259 | Binary Floating-Point Arithmetic.
 260 *----------------------------------------------------------------------------*/
 261
 262 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 263 {
 264     int8 roundingMode;
 265     flag roundNearestEven;
 266     int8 roundIncrement, roundBits;
 267     flag isTiny;
 268
 269     roundingMode = STATUS(float_rounding_mode);
 270     roundNearestEven = ( roundingMode == float_round_nearest_even );
 271     roundIncrement = 0x40;
 272     if ( ! roundNearestEven ) {
 273         if ( roundingMode == float_round_to_zero ) {
 274             roundIncrement = 0;
 275         }
 276         else {
 277             roundIncrement = 0x7F;
 278             if ( zSign ) {
 279                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 280             }
 281             else {
 282                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 283             }
 284         }
 285     }
 286     roundBits = zSig & 0x7F;
 287     if ( 0xFD <= (bits16) zExp ) {
 288         if (    ( 0xFD < zExp )
 289              || (    ( zExp == 0xFD )
 290                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 291            ) {
 292             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 293             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 294         }
 295         if ( zExp < 0 ) {
 296             isTiny =
 297                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 298                 || ( zExp < -1 )
 299                 || ( zSig + roundIncrement < 0x80000000 );
 300             shift32RightJamming( zSig, - zExp, &zSig );
 301             zExp = 0;
 302             roundBits = zSig & 0x7F;
 303             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 304         }
 305     }
 306     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 307     zSig = ( zSig + roundIncrement )>>7;
 308     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 309     if ( zSig == 0 ) zExp = 0;
 310     return packFloat32( zSign, zExp, zSig );
 311
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 316 | and significand `zSig', and returns the proper single-precision floating-
 317 | point value corresponding to the abstract input.  This routine is just like
 318 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 319 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 320 | floating-point exponent.
 321 *----------------------------------------------------------------------------*/
 322
 323 static float32
 324  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 325 {
 326     int8 shiftCount;
 327
 328     shiftCount = countLeadingZeros32( zSig ) - 1;
 329     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 330
 331 }
 332
 333 /*----------------------------------------------------------------------------
 334 | Returns the fraction bits of the double-precision floating-point value `a'.
 335 *----------------------------------------------------------------------------*/
 336
 337 INLINE bits64 extractFloat64Frac( float64 a )
 338 {
 339
 340     return a & LIT64( 0x000FFFFFFFFFFFFF );
 341
 342 }
 343
 344 /*----------------------------------------------------------------------------
 345 | Returns the exponent bits of the double-precision floating-point value `a'.
 346 *----------------------------------------------------------------------------*/
 347
 348 INLINE int16 extractFloat64Exp( float64 a )
 349 {
 350
 351     return ( a>>52 ) & 0x7FF;
 352
 353 }
 354
 355 /*----------------------------------------------------------------------------
 356 | Returns the sign bit of the double-precision floating-point value `a'.
 357 *----------------------------------------------------------------------------*/
 358
 359 INLINE flag extractFloat64Sign( float64 a )
 360 {
 361
 362     return a>>63;
 363
 364 }
 365
 366 /*----------------------------------------------------------------------------
 367 | Normalizes the subnormal double-precision floating-point value represented
 368 | by the denormalized significand `aSig'.  The normalized exponent and
 369 | significand are stored at the locations pointed to by `zExpPtr' and
 370 | `zSigPtr', respectively.
 371 *----------------------------------------------------------------------------*/
 372
 373 static void
 374  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 375 {
 376     int8 shiftCount;
 377
 378     shiftCount = countLeadingZeros64( aSig ) - 11;
 379     *zSigPtr = aSig<<shiftCount;
 380     *zExpPtr = 1 - shiftCount;
 381
 382 }
 383
 384 /*----------------------------------------------------------------------------
 385 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 386 | double-precision floating-point value, returning the result.  After being
 387 | shifted into the proper positions, the three fields are simply added
 388 | together to form the result.  This means that any integer portion of `zSig'
 389 | will be added into the exponent.  Since a properly normalized significand
 390 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 391 | than the desired result exponent whenever `zSig' is a complete, normalized
 392 | significand.
 393 *----------------------------------------------------------------------------*/
 394
 395 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 396 {
 397
 398     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 399
 400 }
 401
 402 /*----------------------------------------------------------------------------
 403 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 404 | and significand `zSig', and returns the proper double-precision floating-
 405 | point value corresponding to the abstract input.  Ordinarily, the abstract
 406 | value is simply rounded and packed into the double-precision format, with
 407 | the inexact exception raised if the abstract input cannot be represented
 408 | exactly.  However, if the abstract value is too large, the overflow and
 409 | inexact exceptions are raised and an infinity or maximal finite value is
 410 | returned.  If the abstract value is too small, the input value is rounded
 411 | to a subnormal number, and the underflow and inexact exceptions are raised
 412 | if the abstract input cannot be represented exactly as a subnormal double-
 413 | precision floating-point number.
 414 |     The input significand `zSig' has its binary point between bits 62
 415 | and 61, which is 10 bits to the left of the usual location.  This shifted
 416 | significand must be normalized or smaller.  If `zSig' is not normalized,
 417 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 418 | and it must not require rounding.  In the usual case that `zSig' is
 419 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 420 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 421 | Binary Floating-Point Arithmetic.
 422 *----------------------------------------------------------------------------*/
 423
 424 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 425 {
 426     int8 roundingMode;
 427     flag roundNearestEven;
 428     int16 roundIncrement, roundBits;
 429     flag isTiny;
 430
 431     roundingMode = STATUS(float_rounding_mode);
 432     roundNearestEven = ( roundingMode == float_round_nearest_even );
 433     roundIncrement = 0x200;
 434     if ( ! roundNearestEven ) {
 435         if ( roundingMode == float_round_to_zero ) {
 436             roundIncrement = 0;
 437         }
 438         else {
 439             roundIncrement = 0x3FF;
 440             if ( zSign ) {
 441                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 442             }
 443             else {
 444                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 445             }
 446         }
 447     }
 448     roundBits = zSig & 0x3FF;
 449     if ( 0x7FD <= (bits16) zExp ) {
 450         if (    ( 0x7FD < zExp )
 451              || (    ( zExp == 0x7FD )
 452                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 453            ) {
 454             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 455             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 456         }
 457         if ( zExp < 0 ) {
 458             isTiny =
 459                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 460                 || ( zExp < -1 )
 461                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 462             shift64RightJamming( zSig, - zExp, &zSig );
 463             zExp = 0;
 464             roundBits = zSig & 0x3FF;
 465             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 466         }
 467     }
 468     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 469     zSig = ( zSig + roundIncrement )>>10;
 470     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 471     if ( zSig == 0 ) zExp = 0;
 472     return packFloat64( zSign, zExp, zSig );
 473
 474 }
 475
 476 /*----------------------------------------------------------------------------
 477 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 478 | and significand `zSig', and returns the proper double-precision floating-
 479 | point value corresponding to the abstract input.  This routine is just like
 480 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 481 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 482 | floating-point exponent.
 483 *----------------------------------------------------------------------------*/
 484
 485 static float64
 486  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 487 {
 488     int8 shiftCount;
 489
 490     shiftCount = countLeadingZeros64( zSig ) - 1;
 491     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 492
 493 }
 494
 495 #ifdef FLOATX80
 496
 497 /*----------------------------------------------------------------------------
 498 | Returns the fraction bits of the extended double-precision floating-point
 499 | value `a'.
 500 *----------------------------------------------------------------------------*/
 501
 502 INLINE bits64 extractFloatx80Frac( floatx80 a )
 503 {
 504
 505     return a.low;
 506
 507 }
 508
 509 /*----------------------------------------------------------------------------
 510 | Returns the exponent bits of the extended double-precision floating-point
 511 | value `a'.
 512 *----------------------------------------------------------------------------*/
 513
 514 INLINE int32 extractFloatx80Exp( floatx80 a )
 515 {
 516
 517     return a.high & 0x7FFF;
 518
 519 }
 520
 521 /*----------------------------------------------------------------------------
 522 | Returns the sign bit of the extended double-precision floating-point value
 523 | `a'.
 524 *----------------------------------------------------------------------------*/
 525
 526 INLINE flag extractFloatx80Sign( floatx80 a )
 527 {
 528
 529     return a.high>>15;
 530
 531 }
 532
 533 /*----------------------------------------------------------------------------
 534 | Normalizes the subnormal extended double-precision floating-point value
 535 | represented by the denormalized significand `aSig'.  The normalized exponent
 536 | and significand are stored at the locations pointed to by `zExpPtr' and
 537 | `zSigPtr', respectively.
 538 *----------------------------------------------------------------------------*/
 539
 540 static void
 541  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 542 {
 543     int8 shiftCount;
 544
 545     shiftCount = countLeadingZeros64( aSig );
 546     *zSigPtr = aSig<<shiftCount;
 547     *zExpPtr = 1 - shiftCount;
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 553 | extended double-precision floating-point value, returning the result.
 554 *----------------------------------------------------------------------------*/
 555
 556 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 557 {
 558     floatx80 z;
 559
 560     z.low = zSig;
 561     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 562     return z;
 563
 564 }
 565
 566 /*----------------------------------------------------------------------------
 567 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 568 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 569 | and returns the proper extended double-precision floating-point value
 570 | corresponding to the abstract input.  Ordinarily, the abstract value is
 571 | rounded and packed into the extended double-precision format, with the
 572 | inexact exception raised if the abstract input cannot be represented
 573 | exactly.  However, if the abstract value is too large, the overflow and
 574 | inexact exceptions are raised and an infinity or maximal finite value is
 575 | returned.  If the abstract value is too small, the input value is rounded to
 576 | a subnormal number, and the underflow and inexact exceptions are raised if
 577 | the abstract input cannot be represented exactly as a subnormal extended
 578 | double-precision floating-point number.
 579 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 580 | number of bits as single or double precision, respectively.  Otherwise, the
 581 | result is rounded to the full precision of the extended double-precision
 582 | format.
 583 |     The input significand must be normalized or smaller.  If the input
 584 | significand is not normalized, `zExp' must be 0; in that case, the result
 585 | returned is a subnormal number, and it must not require rounding.  The
 586 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 587 | Floating-Point Arithmetic.
 588 *----------------------------------------------------------------------------*/
 589
 590 static floatx80
 591  roundAndPackFloatx80(
 592      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 593  STATUS_PARAM)
 594 {
 595     int8 roundingMode;
 596     flag roundNearestEven, increment, isTiny;
 597     int64 roundIncrement, roundMask, roundBits;
 598
 599     roundingMode = STATUS(float_rounding_mode);
 600     roundNearestEven = ( roundingMode == float_round_nearest_even );
 601     if ( roundingPrecision == 80 ) goto precision80;
 602     if ( roundingPrecision == 64 ) {
 603         roundIncrement = LIT64( 0x0000000000000400 );
 604         roundMask = LIT64( 0x00000000000007FF );
 605     }
 606     else if ( roundingPrecision == 32 ) {
 607         roundIncrement = LIT64( 0x0000008000000000 );
 608         roundMask = LIT64( 0x000000FFFFFFFFFF );
 609     }
 610     else {
 611         goto precision80;
 612     }
 613     zSig0 |= ( zSig1 != 0 );
 614     if ( ! roundNearestEven ) {
 615         if ( roundingMode == float_round_to_zero ) {
 616             roundIncrement = 0;
 617         }
 618         else {
 619             roundIncrement = roundMask;
 620             if ( zSign ) {
 621                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 622             }
 623             else {
 624                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 625             }
 626         }
 627     }
 628     roundBits = zSig0 & roundMask;
 629     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 630         if (    ( 0x7FFE < zExp )
 631              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 632            ) {
 633             goto overflow;
 634         }
 635         if ( zExp <= 0 ) {
 636             isTiny =
 637                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 638                 || ( zExp < 0 )
 639                 || ( zSig0 <= zSig0 + roundIncrement );
 640             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 641             zExp = 0;
 642             roundBits = zSig0 & roundMask;
 643             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 644             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 645             zSig0 += roundIncrement;
 646             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 647             roundIncrement = roundMask + 1;
 648             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 649                 roundMask |= roundIncrement;
 650             }
 651             zSig0 &= ~ roundMask;
 652             return packFloatx80( zSign, zExp, zSig0 );
 653         }
 654     }
 655     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 656     zSig0 += roundIncrement;
 657     if ( zSig0 < roundIncrement ) {
 658         ++zExp;
 659         zSig0 = LIT64( 0x8000000000000000 );
 660     }
 661     roundIncrement = roundMask + 1;
 662     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 663         roundMask |= roundIncrement;
 664     }
 665     zSig0 &= ~ roundMask;
 666     if ( zSig0 == 0 ) zExp = 0;
 667     return packFloatx80( zSign, zExp, zSig0 );
 668  precision80:
 669     increment = ( (sbits64) zSig1 < 0 );
 670     if ( ! roundNearestEven ) {
 671         if ( roundingMode == float_round_to_zero ) {
 672             increment = 0;
 673         }
 674         else {
 675             if ( zSign ) {
 676                 increment = ( roundingMode == float_round_down ) && zSig1;
 677             }
 678             else {
 679                 increment = ( roundingMode == float_round_up ) && zSig1;
 680             }
 681         }
 682     }
 683     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 684         if (    ( 0x7FFE < zExp )
 685              || (    ( zExp == 0x7FFE )
 686                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 687                   && increment
 688                 )
 689            ) {
 690             roundMask = 0;
 691  overflow:
 692             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 693             if (    ( roundingMode == float_round_to_zero )
 694                  || ( zSign && ( roundingMode == float_round_up ) )
 695                  || ( ! zSign && ( roundingMode == float_round_down ) )
 696                ) {
 697                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 698             }
 699             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 700         }
 701         if ( zExp <= 0 ) {
 702             isTiny =
 703                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 704                 || ( zExp < 0 )
 705                 || ! increment
 706                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 707             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 708             zExp = 0;
 709             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 710             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 711             if ( roundNearestEven ) {
 712                 increment = ( (sbits64) zSig1 < 0 );
 713             }
 714             else {
 715                 if ( zSign ) {
 716                     increment = ( roundingMode == float_round_down ) && zSig1;
 717                 }
 718                 else {
 719                     increment = ( roundingMode == float_round_up ) && zSig1;
 720                 }
 721             }
 722             if ( increment ) {
 723                 ++zSig0;
 724                 zSig0 &=
 725                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 726                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 727             }
 728             return packFloatx80( zSign, zExp, zSig0 );
 729         }
 730     }
 731     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 732     if ( increment ) {
 733         ++zSig0;
 734         if ( zSig0 == 0 ) {
 735             ++zExp;
 736             zSig0 = LIT64( 0x8000000000000000 );
 737         }
 738         else {
 739             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 740         }
 741     }
 742     else {
 743         if ( zSig0 == 0 ) zExp = 0;
 744     }
 745     return packFloatx80( zSign, zExp, zSig0 );
 746
 747 }
 748
 749 /*----------------------------------------------------------------------------
 750 | Takes an abstract floating-point value having sign `zSign', exponent
 751 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 752 | and returns the proper extended double-precision floating-point value
 753 | corresponding to the abstract input.  This routine is just like
 754 | `roundAndPackFloatx80' except that the input significand does not have to be
 755 | normalized.
 756 *----------------------------------------------------------------------------*/
 757
 758 static floatx80
 759  normalizeRoundAndPackFloatx80(
 760      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 761  STATUS_PARAM)
 762 {
 763     int8 shiftCount;
 764
 765     if ( zSig0 == 0 ) {
 766         zSig0 = zSig1;
 767         zSig1 = 0;
 768         zExp -= 64;
 769     }
 770     shiftCount = countLeadingZeros64( zSig0 );
 771     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 772     zExp -= shiftCount;
 773     return
 774         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 775
 776 }
 777
 778 #endif
 779
 780 #ifdef FLOAT128
 781
 782 /*----------------------------------------------------------------------------
 783 | Returns the least-significant 64 fraction bits of the quadruple-precision
 784 | floating-point value `a'.
 785 *----------------------------------------------------------------------------*/
 786
 787 INLINE bits64 extractFloat128Frac1( float128 a )
 788 {
 789
 790     return a.low;
 791
 792 }
 793
 794 /*----------------------------------------------------------------------------
 795 | Returns the most-significant 48 fraction bits of the quadruple-precision
 796 | floating-point value `a'.
 797 *----------------------------------------------------------------------------*/
 798
 799 INLINE bits64 extractFloat128Frac0( float128 a )
 800 {
 801
 802     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 803
 804 }
 805
 806 /*----------------------------------------------------------------------------
 807 | Returns the exponent bits of the quadruple-precision floating-point value
 808 | `a'.
 809 *----------------------------------------------------------------------------*/
 810
 811 INLINE int32 extractFloat128Exp( float128 a )
 812 {
 813
 814     return ( a.high>>48 ) & 0x7FFF;
 815
 816 }
 817
 818 /*----------------------------------------------------------------------------
 819 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 820 *----------------------------------------------------------------------------*/
 821
 822 INLINE flag extractFloat128Sign( float128 a )
 823 {
 824
 825     return a.high>>63;
 826
 827 }
 828
 829 /*----------------------------------------------------------------------------
 830 | Normalizes the subnormal quadruple-precision floating-point value
 831 | represented by the denormalized significand formed by the concatenation of
 832 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 833 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 834 | significand are stored at the location pointed to by `zSig0Ptr', and the
 835 | least significant 64 bits of the normalized significand are stored at the
 836 | location pointed to by `zSig1Ptr'.
 837 *----------------------------------------------------------------------------*/
 838
 839 static void
 840  normalizeFloat128Subnormal(
 841      bits64 aSig0,
 842      bits64 aSig1,
 843      int32 *zExpPtr,
 844      bits64 *zSig0Ptr,
 845      bits64 *zSig1Ptr
 846  )
 847 {
 848     int8 shiftCount;
 849
 850     if ( aSig0 == 0 ) {
 851         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 852         if ( shiftCount < 0 ) {
 853             *zSig0Ptr = aSig1>>( - shiftCount );
 854             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 855         }
 856         else {
 857             *zSig0Ptr = aSig1<<shiftCount;
 858             *zSig1Ptr = 0;
 859         }
 860         *zExpPtr = - shiftCount - 63;
 861     }
 862     else {
 863         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 864         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 865         *zExpPtr = 1 - shiftCount;
 866     }
 867
 868 }
 869
 870 /*----------------------------------------------------------------------------
 871 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 872 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 873 | floating-point value, returning the result.  After being shifted into the
 874 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 875 | added together to form the most significant 32 bits of the result.  This
 876 | means that any integer portion of `zSig0' will be added into the exponent.
 877 | Since a properly normalized significand will have an integer portion equal
 878 | to 1, the `zExp' input should be 1 less than the desired result exponent
 879 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 880 | significand.
 881 *----------------------------------------------------------------------------*/
 882
 883 INLINE float128
 884  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 885 {
 886     float128 z;
 887
 888     z.low = zSig1;
 889     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 890     return z;
 891
 892 }
 893
 894 /*----------------------------------------------------------------------------
 895 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 896 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 897 | and `zSig2', and returns the proper quadruple-precision floating-point value
 898 | corresponding to the abstract input.  Ordinarily, the abstract value is
 899 | simply rounded and packed into the quadruple-precision format, with the
 900 | inexact exception raised if the abstract input cannot be represented
 901 | exactly.  However, if the abstract value is too large, the overflow and
 902 | inexact exceptions are raised and an infinity or maximal finite value is
 903 | returned.  If the abstract value is too small, the input value is rounded to
 904 | a subnormal number, and the underflow and inexact exceptions are raised if
 905 | the abstract input cannot be represented exactly as a subnormal quadruple-
 906 | precision floating-point number.
 907 |     The input significand must be normalized or smaller.  If the input
 908 | significand is not normalized, `zExp' must be 0; in that case, the result
 909 | returned is a subnormal number, and it must not require rounding.  In the
 910 | usual case that the input significand is normalized, `zExp' must be 1 less
 911 | than the ``true'' floating-point exponent.  The handling of underflow and
 912 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 913 *----------------------------------------------------------------------------*/
 914
 915 static float128
 916  roundAndPackFloat128(
 917      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 918 {
 919     int8 roundingMode;
 920     flag roundNearestEven, increment, isTiny;
 921
 922     roundingMode = STATUS(float_rounding_mode);
 923     roundNearestEven = ( roundingMode == float_round_nearest_even );
 924     increment = ( (sbits64) zSig2 < 0 );
 925     if ( ! roundNearestEven ) {
 926         if ( roundingMode == float_round_to_zero ) {
 927             increment = 0;
 928         }
 929         else {
 930             if ( zSign ) {
 931                 increment = ( roundingMode == float_round_down ) && zSig2;
 932             }
 933             else {
 934                 increment = ( roundingMode == float_round_up ) && zSig2;
 935             }
 936         }
 937     }
 938     if ( 0x7FFD <= (bits32) zExp ) {
 939         if (    ( 0x7FFD < zExp )
 940              || (    ( zExp == 0x7FFD )
 941                   && eq128(
 942                          LIT64( 0x0001FFFFFFFFFFFF ),
 943                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 944                          zSig0,
 945                          zSig1
 946                      )
 947                   && increment
 948                 )
 949            ) {
 950             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 951             if (    ( roundingMode == float_round_to_zero )
 952                  || ( zSign && ( roundingMode == float_round_up ) )
 953                  || ( ! zSign && ( roundingMode == float_round_down ) )
 954                ) {
 955                 return
 956                     packFloat128(
 957                         zSign,
 958                         0x7FFE,
 959                         LIT64( 0x0000FFFFFFFFFFFF ),
 960                         LIT64( 0xFFFFFFFFFFFFFFFF )
 961                     );
 962             }
 963             return packFloat128( zSign, 0x7FFF, 0, 0 );
 964         }
 965         if ( zExp < 0 ) {
 966             isTiny =
 967                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 968                 || ( zExp < -1 )
 969                 || ! increment
 970                 || lt128(
 971                        zSig0,
 972                        zSig1,
 973                        LIT64( 0x0001FFFFFFFFFFFF ),
 974                        LIT64( 0xFFFFFFFFFFFFFFFF )
 975                    );
 976             shift128ExtraRightJamming(
 977                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 978             zExp = 0;
 979             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 980             if ( roundNearestEven ) {
 981                 increment = ( (sbits64) zSig2 < 0 );
 982             }
 983             else {
 984                 if ( zSign ) {
 985                     increment = ( roundingMode == float_round_down ) && zSig2;
 986                 }
 987                 else {
 988                     increment = ( roundingMode == float_round_up ) && zSig2;
 989                 }
 990             }
 991         }
 992     }
 993     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
 994     if ( increment ) {
 995         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 996         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 997     }
 998     else {
 999         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000     }
1001     return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003 }
1004
1005 /*----------------------------------------------------------------------------
1006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1008 | returns the proper quadruple-precision floating-point value corresponding
1009 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1010 | except that the input significand has fewer bits and does not have to be
1011 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012 | point exponent.
1013 *----------------------------------------------------------------------------*/
1014
1015 static float128
1016  normalizeRoundAndPackFloat128(
1017      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018 {
1019     int8 shiftCount;
1020     bits64 zSig2;
1021
1022     if ( zSig0 == 0 ) {
1023         zSig0 = zSig1;
1024         zSig1 = 0;
1025         zExp -= 64;
1026     }
1027     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028     if ( 0 <= shiftCount ) {
1029         zSig2 = 0;
1030         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031     }
1032     else {
1033         shift128ExtraRightJamming(
1034             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035     }
1036     zExp -= shiftCount;
1037     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039 }
1040
1041 #endif
1042
1043 /*----------------------------------------------------------------------------
1044 | Returns the result of converting the 32-bit two's complement integer `a'
1045 | to the single-precision floating-point format.  The conversion is performed
1046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047 *----------------------------------------------------------------------------*/
1048
1049 float32 int32_to_float32( int32 a STATUS_PARAM )
1050 {
1051     flag zSign;
1052
1053     if ( a == 0 ) return 0;
1054     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055     zSign = ( a < 0 );
1056     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058 }
1059
1060 /*----------------------------------------------------------------------------
1061 | Returns the result of converting the 32-bit two's complement integer `a'
1062 | to the double-precision floating-point format.  The conversion is performed
1063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064 *----------------------------------------------------------------------------*/
1065
1066 float64 int32_to_float64( int32 a STATUS_PARAM )
1067 {
1068     flag zSign;
1069     uint32 absA;
1070     int8 shiftCount;
1071     bits64 zSig;
1072
1073     if ( a == 0 ) return 0;
1074     zSign = ( a < 0 );
1075     absA = zSign ? - a : a;
1076     shiftCount = countLeadingZeros32( absA ) + 21;
1077     zSig = absA;
1078     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080 }
1081
1082 #ifdef FLOATX80
1083
1084 /*----------------------------------------------------------------------------
1085 | Returns the result of converting the 32-bit two's complement integer `a'
1086 | to the extended double-precision floating-point format.  The conversion
1087 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088 | Arithmetic.
1089 *----------------------------------------------------------------------------*/
1090
1091 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092 {
1093     flag zSign;
1094     uint32 absA;
1095     int8 shiftCount;
1096     bits64 zSig;
1097
1098     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099     zSign = ( a < 0 );
1100     absA = zSign ? - a : a;
1101     shiftCount = countLeadingZeros32( absA ) + 32;
1102     zSig = absA;
1103     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105 }
1106
1107 #endif
1108
1109 #ifdef FLOAT128
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a' to
1113 | the quadruple-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float128 int32_to_float128( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120     uint32 absA;
1121     int8 shiftCount;
1122     bits64 zSig0;
1123
1124     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125     zSign = ( a < 0 );
1126     absA = zSign ? - a : a;
1127     shiftCount = countLeadingZeros32( absA ) + 17;
1128     zSig0 = absA;
1129     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131 }
1132
1133 #endif
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 64-bit two's complement integer `a'
1137 | to the single-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float32 int64_to_float32( int64 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint64 absA;
1145     int8 shiftCount;
1146
1147     if ( a == 0 ) return 0;
1148     zSign = ( a < 0 );
1149     absA = zSign ? - a : a;
1150     shiftCount = countLeadingZeros64( absA ) - 40;
1151     if ( 0 <= shiftCount ) {
1152         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153     }
1154     else {
1155         shiftCount += 7;
1156         if ( shiftCount < 0 ) {
1157             shift64RightJamming( absA, - shiftCount, &absA );
1158         }
1159         else {
1160             absA <<= shiftCount;
1161         }
1162         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163     }
1164
1165 }
1166
1167 /*----------------------------------------------------------------------------
1168 | Returns the result of converting the 64-bit two's complement integer `a'
1169 | to the double-precision floating-point format.  The conversion is performed
1170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171 *----------------------------------------------------------------------------*/
1172
1173 float64 int64_to_float64( int64 a STATUS_PARAM )
1174 {
1175     flag zSign;
1176
1177     if ( a == 0 ) return 0;
1178     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179         return packFloat64( 1, 0x43E, 0 );
1180     }
1181     zSign = ( a < 0 );
1182     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184 }
1185
1186 #ifdef FLOATX80
1187
1188 /*----------------------------------------------------------------------------
1189 | Returns the result of converting the 64-bit two's complement integer `a'
1190 | to the extended double-precision floating-point format.  The conversion
1191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192 | Arithmetic.
1193 *----------------------------------------------------------------------------*/
1194
1195 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196 {
1197     flag zSign;
1198     uint64 absA;
1199     int8 shiftCount;
1200
1201     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202     zSign = ( a < 0 );
1203     absA = zSign ? - a : a;
1204     shiftCount = countLeadingZeros64( absA );
1205     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207 }
1208
1209 #endif
1210
1211 #ifdef FLOAT128
1212
1213 /*----------------------------------------------------------------------------
1214 | Returns the result of converting the 64-bit two's complement integer `a' to
1215 | the quadruple-precision floating-point format.  The conversion is performed
1216 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217 *----------------------------------------------------------------------------*/
1218
1219 float128 int64_to_float128( int64 a STATUS_PARAM )
1220 {
1221     flag zSign;
1222     uint64 absA;
1223     int8 shiftCount;
1224     int32 zExp;
1225     bits64 zSig0, zSig1;
1226
1227     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros64( absA ) + 49;
1231     zExp = 0x406E - shiftCount;
1232     if ( 64 <= shiftCount ) {
1233         zSig1 = 0;
1234         zSig0 = absA;
1235         shiftCount -= 64;
1236     }
1237     else {
1238         zSig1 = absA;
1239         zSig0 = 0;
1240     }
1241     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242     return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244 }
1245
1246 #endif
1247
1248 /*----------------------------------------------------------------------------
1249 | Returns the result of converting the single-precision floating-point value
1250 | `a' to the 32-bit two's complement integer format.  The conversion is
1251 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1252 | Arithmetic---which means in particular that the conversion is rounded
1253 | according to the current rounding mode.  If `a' is a NaN, the largest
1254 | positive integer is returned.  Otherwise, if the conversion overflows, the
1255 | largest integer with the same sign as `a' is returned.
1256 *----------------------------------------------------------------------------*/
1257
1258 int32 float32_to_int32( float32 a STATUS_PARAM )
1259 {
1260     flag aSign;
1261     int16 aExp, shiftCount;
1262     bits32 aSig;
1263     bits64 aSig64;
1264
1265     aSig = extractFloat32Frac( a );
1266     aExp = extractFloat32Exp( a );
1267     aSign = extractFloat32Sign( a );
1268     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269     if ( aExp ) aSig |= 0x00800000;
1270     shiftCount = 0xAF - aExp;
1271     aSig64 = aSig;
1272     aSig64 <<= 32;
1273     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276 }
1277
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the single-precision floating-point value
1280 | `a' to the 32-bit two's complement integer format.  The conversion is
1281 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic, except that the conversion is always rounded toward zero.
1283 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1284 | the conversion overflows, the largest integer with the same sign as `a' is
1285 | returned.
1286 *----------------------------------------------------------------------------*/
1287
1288 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289 {
1290     flag aSign;
1291     int16 aExp, shiftCount;
1292     bits32 aSig;
1293     int32 z;
1294
1295     aSig = extractFloat32Frac( a );
1296     aExp = extractFloat32Exp( a );
1297     aSign = extractFloat32Sign( a );
1298     shiftCount = aExp - 0x9E;
1299     if ( 0 <= shiftCount ) {
1300         if ( a != 0xCF000000 ) {
1301             float_raise( float_flag_invalid STATUS_VAR);
1302             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303         }
1304         return (sbits32) 0x80000000;
1305     }
1306     else if ( aExp <= 0x7E ) {
1307         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308         return 0;
1309     }
1310     aSig = ( aSig | 0x00800000 )<<8;
1311     z = aSig>>( - shiftCount );
1312     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313         STATUS(float_exception_flags) |= float_flag_inexact;
1314     }
1315     if ( aSign ) z = - z;
1316     return z;
1317
1318 }
1319
1320 /*----------------------------------------------------------------------------
1321 | Returns the result of converting the single-precision floating-point value
1322 | `a' to the 64-bit two's complement integer format.  The conversion is
1323 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1324 | Arithmetic---which means in particular that the conversion is rounded
1325 | according to the current rounding mode.  If `a' is a NaN, the largest
1326 | positive integer is returned.  Otherwise, if the conversion overflows, the
1327 | largest integer with the same sign as `a' is returned.
1328 *----------------------------------------------------------------------------*/
1329
1330 int64 float32_to_int64( float32 a STATUS_PARAM )
1331 {
1332     flag aSign;
1333     int16 aExp, shiftCount;
1334     bits32 aSig;
1335     bits64 aSig64, aSigExtra;
1336
1337     aSig = extractFloat32Frac( a );
1338     aExp = extractFloat32Exp( a );
1339     aSign = extractFloat32Sign( a );
1340     shiftCount = 0xBE - aExp;
1341     if ( shiftCount < 0 ) {
1342         float_raise( float_flag_invalid STATUS_VAR);
1343         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344             return LIT64( 0x7FFFFFFFFFFFFFFF );
1345         }
1346         return (sbits64) LIT64( 0x8000000000000000 );
1347     }
1348     if ( aExp ) aSig |= 0x00800000;
1349     aSig64 = aSig;
1350     aSig64 <<= 40;
1351     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic, except that the conversion is always rounded toward zero.  If
1361 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1362 | conversion overflows, the largest integer with the same sign as `a' is
1363 | returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64;
1372     int64 z;
1373
1374     aSig = extractFloat32Frac( a );
1375     aExp = extractFloat32Exp( a );
1376     aSign = extractFloat32Sign( a );
1377     shiftCount = aExp - 0xBE;
1378     if ( 0 <= shiftCount ) {
1379         if ( a != 0xDF000000 ) {
1380             float_raise( float_flag_invalid STATUS_VAR);
1381             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1383             }
1384         }
1385         return (sbits64) LIT64( 0x8000000000000000 );
1386     }
1387     else if ( aExp <= 0x7E ) {
1388         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389         return 0;
1390     }
1391     aSig64 = aSig | 0x00800000;
1392     aSig64 <<= 40;
1393     z = aSig64>>( - shiftCount );
1394     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395         STATUS(float_exception_flags) |= float_flag_inexact;
1396     }
1397     if ( aSign ) z = - z;
1398     return z;
1399
1400 }
1401
1402 /*----------------------------------------------------------------------------
1403 | Returns the result of converting the single-precision floating-point value
1404 | `a' to the double-precision floating-point format.  The conversion is
1405 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 | Arithmetic.
1407 *----------------------------------------------------------------------------*/
1408
1409 float64 float32_to_float64( float32 a STATUS_PARAM )
1410 {
1411     flag aSign;
1412     int16 aExp;
1413     bits32 aSig;
1414
1415     aSig = extractFloat32Frac( a );
1416     aExp = extractFloat32Exp( a );
1417     aSign = extractFloat32Sign( a );
1418     if ( aExp == 0xFF ) {
1419         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420         return packFloat64( aSign, 0x7FF, 0 );
1421     }
1422     if ( aExp == 0 ) {
1423         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425         --aExp;
1426     }
1427     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429 }
1430
1431 #ifdef FLOATX80
1432
1433 /*----------------------------------------------------------------------------
1434 | Returns the result of converting the single-precision floating-point value
1435 | `a' to the extended double-precision floating-point format.  The conversion
1436 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 | Arithmetic.
1438 *----------------------------------------------------------------------------*/
1439
1440 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441 {
1442     flag aSign;
1443     int16 aExp;
1444     bits32 aSig;
1445
1446     aSig = extractFloat32Frac( a );
1447     aExp = extractFloat32Exp( a );
1448     aSign = extractFloat32Sign( a );
1449     if ( aExp == 0xFF ) {
1450         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452     }
1453     if ( aExp == 0 ) {
1454         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456     }
1457     aSig |= 0x00800000;
1458     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460 }
1461
1462 #endif
1463
1464 #ifdef FLOAT128
1465
1466 /*----------------------------------------------------------------------------
1467 | Returns the result of converting the single-precision floating-point value
1468 | `a' to the double-precision floating-point format.  The conversion is
1469 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1470 | Arithmetic.
1471 *----------------------------------------------------------------------------*/
1472
1473 float128 float32_to_float128( float32 a STATUS_PARAM )
1474 {
1475     flag aSign;
1476     int16 aExp;
1477     bits32 aSig;
1478
1479     aSig = extractFloat32Frac( a );
1480     aExp = extractFloat32Exp( a );
1481     aSign = extractFloat32Sign( a );
1482     if ( aExp == 0xFF ) {
1483         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484         return packFloat128( aSign, 0x7FFF, 0, 0 );
1485     }
1486     if ( aExp == 0 ) {
1487         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489         --aExp;
1490     }
1491     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493 }
1494
1495 #endif
1496
1497 /*----------------------------------------------------------------------------
1498 | Rounds the single-precision floating-point value `a' to an integer, and
1499 | returns the result as a single-precision floating-point value.  The
1500 | operation is performed according to the IEC/IEEE Standard for Binary
1501 | Floating-Point Arithmetic.
1502 *----------------------------------------------------------------------------*/
1503
1504 float32 float32_round_to_int( float32 a STATUS_PARAM)
1505 {
1506     flag aSign;
1507     int16 aExp;
1508     bits32 lastBitMask, roundBitsMask;
1509     int8 roundingMode;
1510     float32 z;
1511
1512     aExp = extractFloat32Exp( a );
1513     if ( 0x96 <= aExp ) {
1514         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515             return propagateFloat32NaN( a, a STATUS_VAR );
1516         }
1517         return a;
1518     }
1519     if ( aExp <= 0x7E ) {
1520         if ( (bits32) ( a<<1 ) == 0 ) return a;
1521         STATUS(float_exception_flags) |= float_flag_inexact;
1522         aSign = extractFloat32Sign( a );
1523         switch ( STATUS(float_rounding_mode) ) {
1524          case float_round_nearest_even:
1525             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526                 return packFloat32( aSign, 0x7F, 0 );
1527             }
1528             break;
1529          case float_round_down:
1530             return aSign ? 0xBF800000 : 0;
1531          case float_round_up:
1532             return aSign ? 0x80000000 : 0x3F800000;
1533         }
1534         return packFloat32( aSign, 0, 0 );
1535     }
1536     lastBitMask = 1;
1537     lastBitMask <<= 0x96 - aExp;
1538     roundBitsMask = lastBitMask - 1;
1539     z = a;
1540     roundingMode = STATUS(float_rounding_mode);
1541     if ( roundingMode == float_round_nearest_even ) {
1542         z += lastBitMask>>1;
1543         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544     }
1545     else if ( roundingMode != float_round_to_zero ) {
1546         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547             z += roundBitsMask;
1548         }
1549     }
1550     z &= ~ roundBitsMask;
1551     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552     return z;
1553
1554 }
1555
1556 /*----------------------------------------------------------------------------
1557 | Returns the result of adding the absolute values of the single-precision
1558 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1559 | before being returned.  `zSign' is ignored if the result is a NaN.
1560 | The addition is performed according to the IEC/IEEE Standard for Binary
1561 | Floating-Point Arithmetic.
1562 *----------------------------------------------------------------------------*/
1563
1564 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565 {
1566     int16 aExp, bExp, zExp;
1567     bits32 aSig, bSig, zSig;
1568     int16 expDiff;
1569
1570     aSig = extractFloat32Frac( a );
1571     aExp = extractFloat32Exp( a );
1572     bSig = extractFloat32Frac( b );
1573     bExp = extractFloat32Exp( b );
1574     expDiff = aExp - bExp;
1575     aSig <<= 6;
1576     bSig <<= 6;
1577     if ( 0 < expDiff ) {
1578         if ( aExp == 0xFF ) {
1579             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580             return a;
1581         }
1582         if ( bExp == 0 ) {
1583             --expDiff;
1584         }
1585         else {
1586             bSig |= 0x20000000;
1587         }
1588         shift32RightJamming( bSig, expDiff, &bSig );
1589         zExp = aExp;
1590     }
1591     else if ( expDiff < 0 ) {
1592         if ( bExp == 0xFF ) {
1593             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594             return packFloat32( zSign, 0xFF, 0 );
1595         }
1596         if ( aExp == 0 ) {
1597             ++expDiff;
1598         }
1599         else {
1600             aSig |= 0x20000000;
1601         }
1602         shift32RightJamming( aSig, - expDiff, &aSig );
1603         zExp = bExp;
1604     }
1605     else {
1606         if ( aExp == 0xFF ) {
1607             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608             return a;
1609         }
1610         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611         zSig = 0x40000000 + aSig + bSig;
1612         zExp = aExp;
1613         goto roundAndPack;
1614     }
1615     aSig |= 0x20000000;
1616     zSig = ( aSig + bSig )<<1;
1617     --zExp;
1618     if ( (sbits32) zSig < 0 ) {
1619         zSig = aSig + bSig;
1620         ++zExp;
1621     }
1622  roundAndPack:
1623     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625 }
1626
1627 /*----------------------------------------------------------------------------
1628 | Returns the result of subtracting the absolute values of the single-
1629 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1630 | difference is negated before being returned.  `zSign' is ignored if the
1631 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1632 | Standard for Binary Floating-Point Arithmetic.
1633 *----------------------------------------------------------------------------*/
1634
1635 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636 {
1637     int16 aExp, bExp, zExp;
1638     bits32 aSig, bSig, zSig;
1639     int16 expDiff;
1640
1641     aSig = extractFloat32Frac( a );
1642     aExp = extractFloat32Exp( a );
1643     bSig = extractFloat32Frac( b );
1644     bExp = extractFloat32Exp( b );
1645     expDiff = aExp - bExp;
1646     aSig <<= 7;
1647     bSig <<= 7;
1648     if ( 0 < expDiff ) goto aExpBigger;
1649     if ( expDiff < 0 ) goto bExpBigger;
1650     if ( aExp == 0xFF ) {
1651         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652         float_raise( float_flag_invalid STATUS_VAR);
1653         return float32_default_nan;
1654     }
1655     if ( aExp == 0 ) {
1656         aExp = 1;
1657         bExp = 1;
1658     }
1659     if ( bSig < aSig ) goto aBigger;
1660     if ( aSig < bSig ) goto bBigger;
1661     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662  bExpBigger:
1663     if ( bExp == 0xFF ) {
1664         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665         return packFloat32( zSign ^ 1, 0xFF, 0 );
1666     }
1667     if ( aExp == 0 ) {
1668         ++expDiff;
1669     }
1670     else {
1671         aSig |= 0x40000000;
1672     }
1673     shift32RightJamming( aSig, - expDiff, &aSig );
1674     bSig |= 0x40000000;
1675  bBigger:
1676     zSig = bSig - aSig;
1677     zExp = bExp;
1678     zSign ^= 1;
1679     goto normalizeRoundAndPack;
1680  aExpBigger:
1681     if ( aExp == 0xFF ) {
1682         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683         return a;
1684     }
1685     if ( bExp == 0 ) {
1686         --expDiff;
1687     }
1688     else {
1689         bSig |= 0x40000000;
1690     }
1691     shift32RightJamming( bSig, expDiff, &bSig );
1692     aSig |= 0x40000000;
1693  aBigger:
1694     zSig = aSig - bSig;
1695     zExp = aExp;
1696  normalizeRoundAndPack:
1697     --zExp;
1698     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700 }
1701
1702 /*----------------------------------------------------------------------------
1703 | Returns the result of adding the single-precision floating-point values `a'
1704 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1705 | Binary Floating-Point Arithmetic.
1706 *----------------------------------------------------------------------------*/
1707
1708 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709 {
1710     flag aSign, bSign;
1711
1712     aSign = extractFloat32Sign( a );
1713     bSign = extractFloat32Sign( b );
1714     if ( aSign == bSign ) {
1715         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716     }
1717     else {
1718         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719     }
1720
1721 }
1722
1723 /*----------------------------------------------------------------------------
1724 | Returns the result of subtracting the single-precision floating-point values
1725 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1726 | for Binary Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728
1729 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730 {
1731     flag aSign, bSign;
1732
1733     aSign = extractFloat32Sign( a );
1734     bSign = extractFloat32Sign( b );
1735     if ( aSign == bSign ) {
1736         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737     }
1738     else {
1739         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740     }
1741
1742 }
1743
1744 /*----------------------------------------------------------------------------
1745 | Returns the result of multiplying the single-precision floating-point values
1746 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1747 | for Binary Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1749
1750 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751 {
1752     flag aSign, bSign, zSign;
1753     int16 aExp, bExp, zExp;
1754     bits32 aSig, bSig;
1755     bits64 zSig64;
1756     bits32 zSig;
1757
1758     aSig = extractFloat32Frac( a );
1759     aExp = extractFloat32Exp( a );
1760     aSign = extractFloat32Sign( a );
1761     bSig = extractFloat32Frac( b );
1762     bExp = extractFloat32Exp( b );
1763     bSign = extractFloat32Sign( b );
1764     zSign = aSign ^ bSign;
1765     if ( aExp == 0xFF ) {
1766         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767             return propagateFloat32NaN( a, b STATUS_VAR );
1768         }
1769         if ( ( bExp | bSig ) == 0 ) {
1770             float_raise( float_flag_invalid STATUS_VAR);
1771             return float32_default_nan;
1772         }
1773         return packFloat32( zSign, 0xFF, 0 );
1774     }
1775     if ( bExp == 0xFF ) {
1776         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777         if ( ( aExp | aSig ) == 0 ) {
1778             float_raise( float_flag_invalid STATUS_VAR);
1779             return float32_default_nan;
1780         }
1781         return packFloat32( zSign, 0xFF, 0 );
1782     }
1783     if ( aExp == 0 ) {
1784         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786     }
1787     if ( bExp == 0 ) {
1788         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790     }
1791     zExp = aExp + bExp - 0x7F;
1792     aSig = ( aSig | 0x00800000 )<<7;
1793     bSig = ( bSig | 0x00800000 )<<8;
1794     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795     zSig = zSig64;
1796     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797         zSig <<= 1;
1798         --zExp;
1799     }
1800     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802 }
1803
1804 /*----------------------------------------------------------------------------
1805 | Returns the result of dividing the single-precision floating-point value `a'
1806 | by the corresponding value `b'.  The operation is performed according to the
1807 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808 *----------------------------------------------------------------------------*/
1809
1810 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811 {
1812     flag aSign, bSign, zSign;
1813     int16 aExp, bExp, zExp;
1814     bits32 aSig, bSig, zSig;
1815
1816     aSig = extractFloat32Frac( a );
1817     aExp = extractFloat32Exp( a );
1818     aSign = extractFloat32Sign( a );
1819     bSig = extractFloat32Frac( b );
1820     bExp = extractFloat32Exp( b );
1821     bSign = extractFloat32Sign( b );
1822     zSign = aSign ^ bSign;
1823     if ( aExp == 0xFF ) {
1824         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825         if ( bExp == 0xFF ) {
1826             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827             float_raise( float_flag_invalid STATUS_VAR);
1828             return float32_default_nan;
1829         }
1830         return packFloat32( zSign, 0xFF, 0 );
1831     }
1832     if ( bExp == 0xFF ) {
1833         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834         return packFloat32( zSign, 0, 0 );
1835     }
1836     if ( bExp == 0 ) {
1837         if ( bSig == 0 ) {
1838             if ( ( aExp | aSig ) == 0 ) {
1839                 float_raise( float_flag_invalid STATUS_VAR);
1840                 return float32_default_nan;
1841             }
1842             float_raise( float_flag_divbyzero STATUS_VAR);
1843             return packFloat32( zSign, 0xFF, 0 );
1844         }
1845         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846     }
1847     if ( aExp == 0 ) {
1848         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850     }
1851     zExp = aExp - bExp + 0x7D;
1852     aSig = ( aSig | 0x00800000 )<<7;
1853     bSig = ( bSig | 0x00800000 )<<8;
1854     if ( bSig <= ( aSig + aSig ) ) {
1855         aSig >>= 1;
1856         ++zExp;
1857     }
1858     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859     if ( ( zSig & 0x3F ) == 0 ) {
1860         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861     }
1862     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864 }
1865
1866 /*----------------------------------------------------------------------------
1867 | Returns the remainder of the single-precision floating-point value `a'
1868 | with respect to the corresponding value `b'.  The operation is performed
1869 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870 *----------------------------------------------------------------------------*/
1871
1872 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873 {
1874     flag aSign, bSign, zSign;
1875     int16 aExp, bExp, expDiff;
1876     bits32 aSig, bSig;
1877     bits32 q;
1878     bits64 aSig64, bSig64, q64;
1879     bits32 alternateASig;
1880     sbits32 sigMean;
1881
1882     aSig = extractFloat32Frac( a );
1883     aExp = extractFloat32Exp( a );
1884     aSign = extractFloat32Sign( a );
1885     bSig = extractFloat32Frac( b );
1886     bExp = extractFloat32Exp( b );
1887     bSign = extractFloat32Sign( b );
1888     if ( aExp == 0xFF ) {
1889         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890             return propagateFloat32NaN( a, b STATUS_VAR );
1891         }
1892         float_raise( float_flag_invalid STATUS_VAR);
1893         return float32_default_nan;
1894     }
1895     if ( bExp == 0xFF ) {
1896         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897         return a;
1898     }
1899     if ( bExp == 0 ) {
1900         if ( bSig == 0 ) {
1901             float_raise( float_flag_invalid STATUS_VAR);
1902             return float32_default_nan;
1903         }
1904         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905     }
1906     if ( aExp == 0 ) {
1907         if ( aSig == 0 ) return a;
1908         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909     }
1910     expDiff = aExp - bExp;
1911     aSig |= 0x00800000;
1912     bSig |= 0x00800000;
1913     if ( expDiff < 32 ) {
1914         aSig <<= 8;
1915         bSig <<= 8;
1916         if ( expDiff < 0 ) {
1917             if ( expDiff < -1 ) return a;
1918             aSig >>= 1;
1919         }
1920         q = ( bSig <= aSig );
1921         if ( q ) aSig -= bSig;
1922         if ( 0 < expDiff ) {
1923             q = ( ( (bits64) aSig )<<32 ) / bSig;
1924             q >>= 32 - expDiff;
1925             bSig >>= 2;
1926             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927         }
1928         else {
1929             aSig >>= 2;
1930             bSig >>= 2;
1931         }
1932     }
1933     else {
1934         if ( bSig <= aSig ) aSig -= bSig;
1935         aSig64 = ( (bits64) aSig )<<40;
1936         bSig64 = ( (bits64) bSig )<<40;
1937         expDiff -= 64;
1938         while ( 0 < expDiff ) {
1939             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941             aSig64 = - ( ( bSig * q64 )<<38 );
1942             expDiff -= 62;
1943         }
1944         expDiff += 64;
1945         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947         q = q64>>( 64 - expDiff );
1948         bSig <<= 6;
1949         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950     }
1951     do {
1952         alternateASig = aSig;
1953         ++q;
1954         aSig -= bSig;
1955     } while ( 0 <= (sbits32) aSig );
1956     sigMean = aSig + alternateASig;
1957     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958         aSig = alternateASig;
1959     }
1960     zSign = ( (sbits32) aSig < 0 );
1961     if ( zSign ) aSig = - aSig;
1962     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964 }
1965
1966 /*----------------------------------------------------------------------------
1967 | Returns the square root of the single-precision floating-point value `a'.
1968 | The operation is performed according to the IEC/IEEE Standard for Binary
1969 | Floating-Point Arithmetic.
1970 *----------------------------------------------------------------------------*/
1971
1972 float32 float32_sqrt( float32 a STATUS_PARAM )
1973 {
1974     flag aSign;
1975     int16 aExp, zExp;
1976     bits32 aSig, zSig;
1977     bits64 rem, term;
1978
1979     aSig = extractFloat32Frac( a );
1980     aExp = extractFloat32Exp( a );
1981     aSign = extractFloat32Sign( a );
1982     if ( aExp == 0xFF ) {
1983         if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984         if ( ! aSign ) return a;
1985         float_raise( float_flag_invalid STATUS_VAR);
1986         return float32_default_nan;
1987     }
1988     if ( aSign ) {
1989         if ( ( aExp | aSig ) == 0 ) return a;
1990         float_raise( float_flag_invalid STATUS_VAR);
1991         return float32_default_nan;
1992     }
1993     if ( aExp == 0 ) {
1994         if ( aSig == 0 ) return 0;
1995         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996     }
1997     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998     aSig = ( aSig | 0x00800000 )<<8;
1999     zSig = estimateSqrt32( aExp, aSig ) + 2;
2000     if ( ( zSig & 0x7F ) <= 5 ) {
2001         if ( zSig < 2 ) {
2002             zSig = 0x7FFFFFFF;
2003             goto roundAndPack;
2004         }
2005         aSig >>= aExp & 1;
2006         term = ( (bits64) zSig ) * zSig;
2007         rem = ( ( (bits64) aSig )<<32 ) - term;
2008         while ( (sbits64) rem < 0 ) {
2009             --zSig;
2010             rem += ( ( (bits64) zSig )<<1 ) | 1;
2011         }
2012         zSig |= ( rem != 0 );
2013     }
2014     shift32RightJamming( zSig, 1, &zSig );
2015  roundAndPack:
2016     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018 }
2019
2020 /*----------------------------------------------------------------------------
2021 | Returns 1 if the single-precision floating-point value `a' is equal to
2022 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2023 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024 *----------------------------------------------------------------------------*/
2025
2026 flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027 {
2028
2029     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031        ) {
2032         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033             float_raise( float_flag_invalid STATUS_VAR);
2034         }
2035         return 0;
2036     }
2037     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038
2039 }
2040
2041 /*----------------------------------------------------------------------------
2042 | Returns 1 if the single-precision floating-point value `a' is less than
2043 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2044 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045 | Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047
2048 flag float32_le( float32 a, float32 b STATUS_PARAM )
2049 {
2050     flag aSign, bSign;
2051
2052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054        ) {
2055         float_raise( float_flag_invalid STATUS_VAR);
2056         return 0;
2057     }
2058     aSign = extractFloat32Sign( a );
2059     bSign = extractFloat32Sign( b );
2060     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061     return ( a == b ) || ( aSign ^ ( a < b ) );
2062
2063 }
2064
2065 /*----------------------------------------------------------------------------
2066 | Returns 1 if the single-precision floating-point value `a' is less than
2067 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2068 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069 *----------------------------------------------------------------------------*/
2070
2071 flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072 {
2073     flag aSign, bSign;
2074
2075     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077        ) {
2078         float_raise( float_flag_invalid STATUS_VAR);
2079         return 0;
2080     }
2081     aSign = extractFloat32Sign( a );
2082     bSign = extractFloat32Sign( b );
2083     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084     return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086 }
2087
2088 /*----------------------------------------------------------------------------
2089 | Returns 1 if the single-precision floating-point value `a' is equal to
2090 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2091 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2092 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093 *----------------------------------------------------------------------------*/
2094
2095 flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096 {
2097
2098     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100        ) {
2101         float_raise( float_flag_invalid STATUS_VAR);
2102         return 0;
2103     }
2104     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105
2106 }
2107
2108 /*----------------------------------------------------------------------------
2109 | Returns 1 if the single-precision floating-point value `a' is less than or
2110 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2111 | cause an exception.  Otherwise, the comparison is performed according to the
2112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113 *----------------------------------------------------------------------------*/
2114
2115 flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116 {
2117     flag aSign, bSign;
2118
2119     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121        ) {
2122         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123             float_raise( float_flag_invalid STATUS_VAR);
2124         }
2125         return 0;
2126     }
2127     aSign = extractFloat32Sign( a );
2128     bSign = extractFloat32Sign( b );
2129     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130     return ( a == b ) || ( aSign ^ ( a < b ) );
2131
2132 }
2133
2134 /*----------------------------------------------------------------------------
2135 | Returns 1 if the single-precision floating-point value `a' is less than
2136 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2137 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2138 | Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
2141 flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142 {
2143     flag aSign, bSign;
2144
2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147        ) {
2148         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149             float_raise( float_flag_invalid STATUS_VAR);
2150         }
2151         return 0;
2152     }
2153     aSign = extractFloat32Sign( a );
2154     bSign = extractFloat32Sign( b );
2155     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156     return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158 }
2159
2160 /*----------------------------------------------------------------------------
2161 | Returns the result of converting the double-precision floating-point value
2162 | `a' to the 32-bit two's complement integer format.  The conversion is
2163 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2164 | Arithmetic---which means in particular that the conversion is rounded
2165 | according to the current rounding mode.  If `a' is a NaN, the largest
2166 | positive integer is returned.  Otherwise, if the conversion overflows, the
2167 | largest integer with the same sign as `a' is returned.
2168 *----------------------------------------------------------------------------*/
2169
2170 int32 float64_to_int32( float64 a STATUS_PARAM )
2171 {
2172     flag aSign;
2173     int16 aExp, shiftCount;
2174     bits64 aSig;
2175
2176     aSig = extractFloat64Frac( a );
2177     aExp = extractFloat64Exp( a );
2178     aSign = extractFloat64Sign( a );
2179     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181     shiftCount = 0x42C - aExp;
2182     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the result of converting the double-precision floating-point value
2189 | `a' to the 32-bit two's complement integer format.  The conversion is
2190 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2191 | Arithmetic, except that the conversion is always rounded toward zero.
2192 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2193 | the conversion overflows, the largest integer with the same sign as `a' is
2194 | returned.
2195 *----------------------------------------------------------------------------*/
2196
2197 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198 {
2199     flag aSign;
2200     int16 aExp, shiftCount;
2201     bits64 aSig, savedASig;
2202     int32 z;
2203
2204     aSig = extractFloat64Frac( a );
2205     aExp = extractFloat64Exp( a );
2206     aSign = extractFloat64Sign( a );
2207     if ( 0x41E < aExp ) {
2208         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209         goto invalid;
2210     }
2211     else if ( aExp < 0x3FF ) {
2212         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213         return 0;
2214     }
2215     aSig |= LIT64( 0x0010000000000000 );
2216     shiftCount = 0x433 - aExp;
2217     savedASig = aSig;
2218     aSig >>= shiftCount;
2219     z = aSig;
2220     if ( aSign ) z = - z;
2221     if ( ( z < 0 ) ^ aSign ) {
2222  invalid:
2223         float_raise( float_flag_invalid STATUS_VAR);
2224         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225     }
2226     if ( ( aSig<<shiftCount ) != savedASig ) {
2227         STATUS(float_exception_flags) |= float_flag_inexact;
2228     }
2229     return z;
2230
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 64-bit two's complement integer format.  The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic---which means in particular that the conversion is rounded
2238 | according to the current rounding mode.  If `a' is a NaN, the largest
2239 | positive integer is returned.  Otherwise, if the conversion overflows, the
2240 | largest integer with the same sign as `a' is returned.
2241 *----------------------------------------------------------------------------*/
2242
2243 int64 float64_to_int64( float64 a STATUS_PARAM )
2244 {
2245     flag aSign;
2246     int16 aExp, shiftCount;
2247     bits64 aSig, aSigExtra;
2248
2249     aSig = extractFloat64Frac( a );
2250     aExp = extractFloat64Exp( a );
2251     aSign = extractFloat64Sign( a );
2252     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253     shiftCount = 0x433 - aExp;
2254     if ( shiftCount <= 0 ) {
2255         if ( 0x43E < aExp ) {
2256             float_raise( float_flag_invalid STATUS_VAR);
2257             if (    ! aSign
2258                  || (    ( aExp == 0x7FF )
2259                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260                ) {
2261                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2262             }
2263             return (sbits64) LIT64( 0x8000000000000000 );
2264         }
2265         aSigExtra = 0;
2266         aSig <<= - shiftCount;
2267     }
2268     else {
2269         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270     }
2271     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273 }
2274
2275 /*----------------------------------------------------------------------------
2276 | Returns the result of converting the double-precision floating-point value
2277 | `a' to the 64-bit two's complement integer format.  The conversion is
2278 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2279 | Arithmetic, except that the conversion is always rounded toward zero.
2280 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2281 | the conversion overflows, the largest integer with the same sign as `a' is
2282 | returned.
2283 *----------------------------------------------------------------------------*/
2284
2285 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286 {
2287     flag aSign;
2288     int16 aExp, shiftCount;
2289     bits64 aSig;
2290     int64 z;
2291
2292     aSig = extractFloat64Frac( a );
2293     aExp = extractFloat64Exp( a );
2294     aSign = extractFloat64Sign( a );
2295     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296     shiftCount = aExp - 0x433;
2297     if ( 0 <= shiftCount ) {
2298         if ( 0x43E <= aExp ) {
2299             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300                 float_raise( float_flag_invalid STATUS_VAR);
2301                 if (    ! aSign
2302                      || (    ( aExp == 0x7FF )
2303                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304                    ) {
2305                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2306                 }
2307             }
2308             return (sbits64) LIT64( 0x8000000000000000 );
2309         }
2310         z = aSig<<shiftCount;
2311     }
2312     else {
2313         if ( aExp < 0x3FE ) {
2314             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315             return 0;
2316         }
2317         z = aSig>>( - shiftCount );
2318         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319             STATUS(float_exception_flags) |= float_flag_inexact;
2320         }
2321     }
2322     if ( aSign ) z = - z;
2323     return z;
2324
2325 }
2326
2327 /*----------------------------------------------------------------------------
2328 | Returns the result of converting the double-precision floating-point value
2329 | `a' to the single-precision floating-point format.  The conversion is
2330 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 | Arithmetic.
2332 *----------------------------------------------------------------------------*/
2333
2334 float32 float64_to_float32( float64 a STATUS_PARAM )
2335 {
2336     flag aSign;
2337     int16 aExp;
2338     bits64 aSig;
2339     bits32 zSig;
2340
2341     aSig = extractFloat64Frac( a );
2342     aExp = extractFloat64Exp( a );
2343     aSign = extractFloat64Sign( a );
2344     if ( aExp == 0x7FF ) {
2345         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346         return packFloat32( aSign, 0xFF, 0 );
2347     }
2348     shift64RightJamming( aSig, 22, &aSig );
2349     zSig = aSig;
2350     if ( aExp || zSig ) {
2351         zSig |= 0x40000000;
2352         aExp -= 0x381;
2353     }
2354     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356 }
2357
2358 #ifdef FLOATX80
2359
2360 /*----------------------------------------------------------------------------
2361 | Returns the result of converting the double-precision floating-point value
2362 | `a' to the extended double-precision floating-point format.  The conversion
2363 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 | Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366
2367 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368 {
2369     flag aSign;
2370     int16 aExp;
2371     bits64 aSig;
2372
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( aExp == 0x7FF ) {
2377         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379     }
2380     if ( aExp == 0 ) {
2381         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383     }
2384     return
2385         packFloatx80(
2386             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387
2388 }
2389
2390 #endif
2391
2392 #ifdef FLOAT128
2393
2394 /*----------------------------------------------------------------------------
2395 | Returns the result of converting the double-precision floating-point value
2396 | `a' to the quadruple-precision floating-point format.  The conversion is
2397 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2398 | Arithmetic.
2399 *----------------------------------------------------------------------------*/
2400
2401 float128 float64_to_float128( float64 a STATUS_PARAM )
2402 {
2403     flag aSign;
2404     int16 aExp;
2405     bits64 aSig, zSig0, zSig1;
2406
2407     aSig = extractFloat64Frac( a );
2408     aExp = extractFloat64Exp( a );
2409     aSign = extractFloat64Sign( a );
2410     if ( aExp == 0x7FF ) {
2411         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412         return packFloat128( aSign, 0x7FFF, 0, 0 );
2413     }
2414     if ( aExp == 0 ) {
2415         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417         --aExp;
2418     }
2419     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421
2422 }
2423
2424 #endif
2425
2426 /*----------------------------------------------------------------------------
2427 | Rounds the double-precision floating-point value `a' to an integer, and
2428 | returns the result as a double-precision floating-point value.  The
2429 | operation is performed according to the IEC/IEEE Standard for Binary
2430 | Floating-Point Arithmetic.
2431 *----------------------------------------------------------------------------*/
2432
2433 float64 float64_round_to_int( float64 a STATUS_PARAM )
2434 {
2435     flag aSign;
2436     int16 aExp;
2437     bits64 lastBitMask, roundBitsMask;
2438     int8 roundingMode;
2439     float64 z;
2440
2441     aExp = extractFloat64Exp( a );
2442     if ( 0x433 <= aExp ) {
2443         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444             return propagateFloat64NaN( a, a STATUS_VAR );
2445         }
2446         return a;
2447     }
2448     if ( aExp < 0x3FF ) {
2449         if ( (bits64) ( a<<1 ) == 0 ) return a;
2450         STATUS(float_exception_flags) |= float_flag_inexact;
2451         aSign = extractFloat64Sign( a );
2452         switch ( STATUS(float_rounding_mode) ) {
2453          case float_round_nearest_even:
2454             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455                 return packFloat64( aSign, 0x3FF, 0 );
2456             }
2457             break;
2458          case float_round_down:
2459             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460          case float_round_up:
2461             return
2462             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463         }
2464         return packFloat64( aSign, 0, 0 );
2465     }
2466     lastBitMask = 1;
2467     lastBitMask <<= 0x433 - aExp;
2468     roundBitsMask = lastBitMask - 1;
2469     z = a;
2470     roundingMode = STATUS(float_rounding_mode);
2471     if ( roundingMode == float_round_nearest_even ) {
2472         z += lastBitMask>>1;
2473         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474     }
2475     else if ( roundingMode != float_round_to_zero ) {
2476         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477             z += roundBitsMask;
2478         }
2479     }
2480     z &= ~ roundBitsMask;
2481     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482     return z;
2483
2484 }
2485
2486 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2487 {
2488     int oldmode;
2489     float64 res;
2490     oldmode = STATUS(float_rounding_mode);
2491     STATUS(float_rounding_mode) = float_round_to_zero;
2492     res = float64_round_to_int(a STATUS_VAR);
2493     STATUS(float_rounding_mode) = oldmode;
2494     return res;
2495 }
2496
2497 /*----------------------------------------------------------------------------
2498 | Returns the result of adding the absolute values of the double-precision
2499 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2500 | before being returned.  `zSign' is ignored if the result is a NaN.
2501 | The addition is performed according to the IEC/IEEE Standard for Binary
2502 | Floating-Point Arithmetic.
2503 *----------------------------------------------------------------------------*/
2504
2505 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2506 {
2507     int16 aExp, bExp, zExp;
2508     bits64 aSig, bSig, zSig;
2509     int16 expDiff;
2510
2511     aSig = extractFloat64Frac( a );
2512     aExp = extractFloat64Exp( a );
2513     bSig = extractFloat64Frac( b );
2514     bExp = extractFloat64Exp( b );
2515     expDiff = aExp - bExp;
2516     aSig <<= 9;
2517     bSig <<= 9;
2518     if ( 0 < expDiff ) {
2519         if ( aExp == 0x7FF ) {
2520             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2521             return a;
2522         }
2523         if ( bExp == 0 ) {
2524             --expDiff;
2525         }
2526         else {
2527             bSig |= LIT64( 0x2000000000000000 );
2528         }
2529         shift64RightJamming( bSig, expDiff, &bSig );
2530         zExp = aExp;
2531     }
2532     else if ( expDiff < 0 ) {
2533         if ( bExp == 0x7FF ) {
2534             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2535             return packFloat64( zSign, 0x7FF, 0 );
2536         }
2537         if ( aExp == 0 ) {
2538             ++expDiff;
2539         }
2540         else {
2541             aSig |= LIT64( 0x2000000000000000 );
2542         }
2543         shift64RightJamming( aSig, - expDiff, &aSig );
2544         zExp = bExp;
2545     }
2546     else {
2547         if ( aExp == 0x7FF ) {
2548             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549             return a;
2550         }
2551         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2552         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2553         zExp = aExp;
2554         goto roundAndPack;
2555     }
2556     aSig |= LIT64( 0x2000000000000000 );
2557     zSig = ( aSig + bSig )<<1;
2558     --zExp;
2559     if ( (sbits64) zSig < 0 ) {
2560         zSig = aSig + bSig;
2561         ++zExp;
2562     }
2563  roundAndPack:
2564     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2565
2566 }
2567
2568 /*----------------------------------------------------------------------------
2569 | Returns the result of subtracting the absolute values of the double-
2570 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2571 | difference is negated before being returned.  `zSign' is ignored if the
2572 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2573 | Standard for Binary Floating-Point Arithmetic.
2574 *----------------------------------------------------------------------------*/
2575
2576 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2577 {
2578     int16 aExp, bExp, zExp;
2579     bits64 aSig, bSig, zSig;
2580     int16 expDiff;
2581
2582     aSig = extractFloat64Frac( a );
2583     aExp = extractFloat64Exp( a );
2584     bSig = extractFloat64Frac( b );
2585     bExp = extractFloat64Exp( b );
2586     expDiff = aExp - bExp;
2587     aSig <<= 10;
2588     bSig <<= 10;
2589     if ( 0 < expDiff ) goto aExpBigger;
2590     if ( expDiff < 0 ) goto bExpBigger;
2591     if ( aExp == 0x7FF ) {
2592         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2593         float_raise( float_flag_invalid STATUS_VAR);
2594         return float64_default_nan;
2595     }
2596     if ( aExp == 0 ) {
2597         aExp = 1;
2598         bExp = 1;
2599     }
2600     if ( bSig < aSig ) goto aBigger;
2601     if ( aSig < bSig ) goto bBigger;
2602     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2603  bExpBigger:
2604     if ( bExp == 0x7FF ) {
2605         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2606         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2607     }
2608     if ( aExp == 0 ) {
2609         ++expDiff;
2610     }
2611     else {
2612         aSig |= LIT64( 0x4000000000000000 );
2613     }
2614     shift64RightJamming( aSig, - expDiff, &aSig );
2615     bSig |= LIT64( 0x4000000000000000 );
2616  bBigger:
2617     zSig = bSig - aSig;
2618     zExp = bExp;
2619     zSign ^= 1;
2620     goto normalizeRoundAndPack;
2621  aExpBigger:
2622     if ( aExp == 0x7FF ) {
2623         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624         return a;
2625     }
2626     if ( bExp == 0 ) {
2627         --expDiff;
2628     }
2629     else {
2630         bSig |= LIT64( 0x4000000000000000 );
2631     }
2632     shift64RightJamming( bSig, expDiff, &bSig );
2633     aSig |= LIT64( 0x4000000000000000 );
2634  aBigger:
2635     zSig = aSig - bSig;
2636     zExp = aExp;
2637  normalizeRoundAndPack:
2638     --zExp;
2639     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2640
2641 }
2642
2643 /*----------------------------------------------------------------------------
2644 | Returns the result of adding the double-precision floating-point values `a'
2645 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2646 | Binary Floating-Point Arithmetic.
2647 *----------------------------------------------------------------------------*/
2648
2649 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2650 {
2651     flag aSign, bSign;
2652
2653     aSign = extractFloat64Sign( a );
2654     bSign = extractFloat64Sign( b );
2655     if ( aSign == bSign ) {
2656         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2657     }
2658     else {
2659         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2660     }
2661
2662 }
2663
2664 /*----------------------------------------------------------------------------
2665 | Returns the result of subtracting the double-precision floating-point values
2666 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2667 | for Binary Floating-Point Arithmetic.
2668 *----------------------------------------------------------------------------*/
2669
2670 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2671 {
2672     flag aSign, bSign;
2673
2674     aSign = extractFloat64Sign( a );
2675     bSign = extractFloat64Sign( b );
2676     if ( aSign == bSign ) {
2677         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2678     }
2679     else {
2680         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2681     }
2682
2683 }
2684
2685 /*----------------------------------------------------------------------------
2686 | Returns the result of multiplying the double-precision floating-point values
2687 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2688 | for Binary Floating-Point Arithmetic.
2689 *----------------------------------------------------------------------------*/
2690
2691 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2692 {
2693     flag aSign, bSign, zSign;
2694     int16 aExp, bExp, zExp;
2695     bits64 aSig, bSig, zSig0, zSig1;
2696
2697     aSig = extractFloat64Frac( a );
2698     aExp = extractFloat64Exp( a );
2699     aSign = extractFloat64Sign( a );
2700     bSig = extractFloat64Frac( b );
2701     bExp = extractFloat64Exp( b );
2702     bSign = extractFloat64Sign( b );
2703     zSign = aSign ^ bSign;
2704     if ( aExp == 0x7FF ) {
2705         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2706             return propagateFloat64NaN( a, b STATUS_VAR );
2707         }
2708         if ( ( bExp | bSig ) == 0 ) {
2709             float_raise( float_flag_invalid STATUS_VAR);
2710             return float64_default_nan;
2711         }
2712         return packFloat64( zSign, 0x7FF, 0 );
2713     }
2714     if ( bExp == 0x7FF ) {
2715         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2716         if ( ( aExp | aSig ) == 0 ) {
2717             float_raise( float_flag_invalid STATUS_VAR);
2718             return float64_default_nan;
2719         }
2720         return packFloat64( zSign, 0x7FF, 0 );
2721     }
2722     if ( aExp == 0 ) {
2723         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2724         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2725     }
2726     if ( bExp == 0 ) {
2727         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2728         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2729     }
2730     zExp = aExp + bExp - 0x3FF;
2731     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2732     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2733     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2734     zSig0 |= ( zSig1 != 0 );
2735     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2736         zSig0 <<= 1;
2737         --zExp;
2738     }
2739     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2740
2741 }
2742
2743 /*----------------------------------------------------------------------------
2744 | Returns the result of dividing the double-precision floating-point value `a'
2745 | by the corresponding value `b'.  The operation is performed according to
2746 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2747 *----------------------------------------------------------------------------*/
2748
2749 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2750 {
2751     flag aSign, bSign, zSign;
2752     int16 aExp, bExp, zExp;
2753     bits64 aSig, bSig, zSig;
2754     bits64 rem0, rem1;
2755     bits64 term0, term1;
2756
2757     aSig = extractFloat64Frac( a );
2758     aExp = extractFloat64Exp( a );
2759     aSign = extractFloat64Sign( a );
2760     bSig = extractFloat64Frac( b );
2761     bExp = extractFloat64Exp( b );
2762     bSign = extractFloat64Sign( b );
2763     zSign = aSign ^ bSign;
2764     if ( aExp == 0x7FF ) {
2765         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2766         if ( bExp == 0x7FF ) {
2767             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2768             float_raise( float_flag_invalid STATUS_VAR);
2769             return float64_default_nan;
2770         }
2771         return packFloat64( zSign, 0x7FF, 0 );
2772     }
2773     if ( bExp == 0x7FF ) {
2774         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775         return packFloat64( zSign, 0, 0 );
2776     }
2777     if ( bExp == 0 ) {
2778         if ( bSig == 0 ) {
2779             if ( ( aExp | aSig ) == 0 ) {
2780                 float_raise( float_flag_invalid STATUS_VAR);
2781                 return float64_default_nan;
2782             }
2783             float_raise( float_flag_divbyzero STATUS_VAR);
2784             return packFloat64( zSign, 0x7FF, 0 );
2785         }
2786         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2787     }
2788     if ( aExp == 0 ) {
2789         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2790         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2791     }
2792     zExp = aExp - bExp + 0x3FD;
2793     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2794     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2795     if ( bSig <= ( aSig + aSig ) ) {
2796         aSig >>= 1;
2797         ++zExp;
2798     }
2799     zSig = estimateDiv128To64( aSig, 0, bSig );
2800     if ( ( zSig & 0x1FF ) <= 2 ) {
2801         mul64To128( bSig, zSig, &term0, &term1 );
2802         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2803         while ( (sbits64) rem0 < 0 ) {
2804             --zSig;
2805             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2806         }
2807         zSig |= ( rem1 != 0 );
2808     }
2809     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2810
2811 }
2812
2813 /*----------------------------------------------------------------------------
2814 | Returns the remainder of the double-precision floating-point value `a'
2815 | with respect to the corresponding value `b'.  The operation is performed
2816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2817 *----------------------------------------------------------------------------*/
2818
2819 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2820 {
2821     flag aSign, bSign, zSign;
2822     int16 aExp, bExp, expDiff;
2823     bits64 aSig, bSig;
2824     bits64 q, alternateASig;
2825     sbits64 sigMean;
2826
2827     aSig = extractFloat64Frac( a );
2828     aExp = extractFloat64Exp( a );
2829     aSign = extractFloat64Sign( a );
2830     bSig = extractFloat64Frac( b );
2831     bExp = extractFloat64Exp( b );
2832     bSign = extractFloat64Sign( b );
2833     if ( aExp == 0x7FF ) {
2834         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835             return propagateFloat64NaN( a, b STATUS_VAR );
2836         }
2837         float_raise( float_flag_invalid STATUS_VAR);
2838         return float64_default_nan;
2839     }
2840     if ( bExp == 0x7FF ) {
2841         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2842         return a;
2843     }
2844     if ( bExp == 0 ) {
2845         if ( bSig == 0 ) {
2846             float_raise( float_flag_invalid STATUS_VAR);
2847             return float64_default_nan;
2848         }
2849         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2850     }
2851     if ( aExp == 0 ) {
2852         if ( aSig == 0 ) return a;
2853         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854     }
2855     expDiff = aExp - bExp;
2856     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2857     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858     if ( expDiff < 0 ) {
2859         if ( expDiff < -1 ) return a;
2860         aSig >>= 1;
2861     }
2862     q = ( bSig <= aSig );
2863     if ( q ) aSig -= bSig;
2864     expDiff -= 64;
2865     while ( 0 < expDiff ) {
2866         q = estimateDiv128To64( aSig, 0, bSig );
2867         q = ( 2 < q ) ? q - 2 : 0;
2868         aSig = - ( ( bSig>>2 ) * q );
2869         expDiff -= 62;
2870     }
2871     expDiff += 64;
2872     if ( 0 < expDiff ) {
2873         q = estimateDiv128To64( aSig, 0, bSig );
2874         q = ( 2 < q ) ? q - 2 : 0;
2875         q >>= 64 - expDiff;
2876         bSig >>= 2;
2877         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2878     }
2879     else {
2880         aSig >>= 2;
2881         bSig >>= 2;
2882     }
2883     do {
2884         alternateASig = aSig;
2885         ++q;
2886         aSig -= bSig;
2887     } while ( 0 <= (sbits64) aSig );
2888     sigMean = aSig + alternateASig;
2889     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2890         aSig = alternateASig;
2891     }
2892     zSign = ( (sbits64) aSig < 0 );
2893     if ( zSign ) aSig = - aSig;
2894     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2895
2896 }
2897
2898 /*----------------------------------------------------------------------------
2899 | Returns the square root of the double-precision floating-point value `a'.
2900 | The operation is performed according to the IEC/IEEE Standard for Binary
2901 | Floating-Point Arithmetic.
2902 *----------------------------------------------------------------------------*/
2903
2904 float64 float64_sqrt( float64 a STATUS_PARAM )
2905 {
2906     flag aSign;
2907     int16 aExp, zExp;
2908     bits64 aSig, zSig, doubleZSig;
2909     bits64 rem0, rem1, term0, term1;
2910
2911     aSig = extractFloat64Frac( a );
2912     aExp = extractFloat64Exp( a );
2913     aSign = extractFloat64Sign( a );
2914     if ( aExp == 0x7FF ) {
2915         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2916         if ( ! aSign ) return a;
2917         float_raise( float_flag_invalid STATUS_VAR);
2918         return float64_default_nan;
2919     }
2920     if ( aSign ) {
2921         if ( ( aExp | aSig ) == 0 ) return a;
2922         float_raise( float_flag_invalid STATUS_VAR);
2923         return float64_default_nan;
2924     }
2925     if ( aExp == 0 ) {
2926         if ( aSig == 0 ) return 0;
2927         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2928     }
2929     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2930     aSig |= LIT64( 0x0010000000000000 );
2931     zSig = estimateSqrt32( aExp, aSig>>21 );
2932     aSig <<= 9 - ( aExp & 1 );
2933     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2934     if ( ( zSig & 0x1FF ) <= 5 ) {
2935         doubleZSig = zSig<<1;
2936         mul64To128( zSig, zSig, &term0, &term1 );
2937         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2938         while ( (sbits64) rem0 < 0 ) {
2939             --zSig;
2940             doubleZSig -= 2;
2941             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2942         }
2943         zSig |= ( ( rem0 | rem1 ) != 0 );
2944     }
2945     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2946
2947 }
2948
2949 /*----------------------------------------------------------------------------
2950 | Returns 1 if the double-precision floating-point value `a' is equal to the
2951 | corresponding value `b', and 0 otherwise.  The comparison is performed
2952 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2954
2955 flag float64_eq( float64 a, float64 b STATUS_PARAM )
2956 {
2957
2958     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2959          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2960        ) {
2961         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2962             float_raise( float_flag_invalid STATUS_VAR);
2963         }
2964         return 0;
2965     }
2966     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2967
2968 }
2969
2970 /*----------------------------------------------------------------------------
2971 | Returns 1 if the double-precision floating-point value `a' is less than or
2972 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2973 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2974 | Arithmetic.
2975 *----------------------------------------------------------------------------*/
2976
2977 flag float64_le( float64 a, float64 b STATUS_PARAM )
2978 {
2979     flag aSign, bSign;
2980
2981     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2982          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2983        ) {
2984         float_raise( float_flag_invalid STATUS_VAR);
2985         return 0;
2986     }
2987     aSign = extractFloat64Sign( a );
2988     bSign = extractFloat64Sign( b );
2989     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2990     return ( a == b ) || ( aSign ^ ( a < b ) );
2991
2992 }
2993
2994 /*----------------------------------------------------------------------------
2995 | Returns 1 if the double-precision floating-point value `a' is less than
2996 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2998 *----------------------------------------------------------------------------*/
2999
3000 flag float64_lt( float64 a, float64 b STATUS_PARAM )
3001 {
3002     flag aSign, bSign;
3003
3004     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3005          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3006        ) {
3007         float_raise( float_flag_invalid STATUS_VAR);
3008         return 0;
3009     }
3010     aSign = extractFloat64Sign( a );
3011     bSign = extractFloat64Sign( b );
3012     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3013     return ( a != b ) && ( aSign ^ ( a < b ) );
3014
3015 }
3016
3017 /*----------------------------------------------------------------------------
3018 | Returns 1 if the double-precision floating-point value `a' is equal to the
3019 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3020 | if either operand is a NaN.  Otherwise, the comparison is performed
3021 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3022 *----------------------------------------------------------------------------*/
3023
3024 flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3025 {
3026
3027     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3028          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3029        ) {
3030         float_raise( float_flag_invalid STATUS_VAR);
3031         return 0;
3032     }
3033     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3034
3035 }
3036
3037 /*----------------------------------------------------------------------------
3038 | Returns 1 if the double-precision floating-point value `a' is less than or
3039 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3040 | cause an exception.  Otherwise, the comparison is performed according to the
3041 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3042 *----------------------------------------------------------------------------*/
3043
3044 flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3045 {
3046     flag aSign, bSign;
3047
3048     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3049          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3050        ) {
3051         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3052             float_raise( float_flag_invalid STATUS_VAR);
3053         }
3054         return 0;
3055     }
3056     aSign = extractFloat64Sign( a );
3057     bSign = extractFloat64Sign( b );
3058     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3059     return ( a == b ) || ( aSign ^ ( a < b ) );
3060
3061 }
3062
3063 /*----------------------------------------------------------------------------
3064 | Returns 1 if the double-precision floating-point value `a' is less than
3065 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3066 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3067 | Standard for Binary Floating-Point Arithmetic.
3068 *----------------------------------------------------------------------------*/
3069
3070 flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3071 {
3072     flag aSign, bSign;
3073
3074     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3075          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3076        ) {
3077         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3078             float_raise( float_flag_invalid STATUS_VAR);
3079         }
3080         return 0;
3081     }
3082     aSign = extractFloat64Sign( a );
3083     bSign = extractFloat64Sign( b );
3084     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3085     return ( a != b ) && ( aSign ^ ( a < b ) );
3086
3087 }
3088
3089 #ifdef FLOATX80
3090
3091 /*----------------------------------------------------------------------------
3092 | Returns the result of converting the extended double-precision floating-
3093 | point value `a' to the 32-bit two's complement integer format.  The
3094 | conversion is performed according to the IEC/IEEE Standard for Binary
3095 | Floating-Point Arithmetic---which means in particular that the conversion
3096 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3097 | largest positive integer is returned.  Otherwise, if the conversion
3098 | overflows, the largest integer with the same sign as `a' is returned.
3099 *----------------------------------------------------------------------------*/
3100
3101 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3102 {
3103     flag aSign;
3104     int32 aExp, shiftCount;
3105     bits64 aSig;
3106
3107     aSig = extractFloatx80Frac( a );
3108     aExp = extractFloatx80Exp( a );
3109     aSign = extractFloatx80Sign( a );
3110     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3111     shiftCount = 0x4037 - aExp;
3112     if ( shiftCount <= 0 ) shiftCount = 1;
3113     shift64RightJamming( aSig, shiftCount, &aSig );
3114     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3115
3116 }
3117
3118 /*----------------------------------------------------------------------------
3119 | Returns the result of converting the extended double-precision floating-
3120 | point value `a' to the 32-bit two's complement integer format.  The
3121 | conversion is performed according to the IEC/IEEE Standard for Binary
3122 | Floating-Point Arithmetic, except that the conversion is always rounded
3123 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3124 | Otherwise, if the conversion overflows, the largest integer with the same
3125 | sign as `a' is returned.
3126 *----------------------------------------------------------------------------*/
3127
3128 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3129 {
3130     flag aSign;
3131     int32 aExp, shiftCount;
3132     bits64 aSig, savedASig;
3133     int32 z;
3134
3135     aSig = extractFloatx80Frac( a );
3136     aExp = extractFloatx80Exp( a );
3137     aSign = extractFloatx80Sign( a );
3138     if ( 0x401E < aExp ) {
3139         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3140         goto invalid;
3141     }
3142     else if ( aExp < 0x3FFF ) {
3143         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3144         return 0;
3145     }
3146     shiftCount = 0x403E - aExp;
3147     savedASig = aSig;
3148     aSig >>= shiftCount;
3149     z = aSig;
3150     if ( aSign ) z = - z;
3151     if ( ( z < 0 ) ^ aSign ) {
3152  invalid:
3153         float_raise( float_flag_invalid STATUS_VAR);
3154         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3155     }
3156     if ( ( aSig<<shiftCount ) != savedASig ) {
3157         STATUS(float_exception_flags) |= float_flag_inexact;
3158     }
3159     return z;
3160
3161 }
3162
3163 /*----------------------------------------------------------------------------
3164 | Returns the result of converting the extended double-precision floating-
3165 | point value `a' to the 64-bit two's complement integer format.  The
3166 | conversion is performed according to the IEC/IEEE Standard for Binary
3167 | Floating-Point Arithmetic---which means in particular that the conversion
3168 | is rounded according to the current rounding mode.  If `a' is a NaN,
3169 | the largest positive integer is returned.  Otherwise, if the conversion
3170 | overflows, the largest integer with the same sign as `a' is returned.
3171 *----------------------------------------------------------------------------*/
3172
3173 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3174 {
3175     flag aSign;
3176     int32 aExp, shiftCount;
3177     bits64 aSig, aSigExtra;
3178
3179     aSig = extractFloatx80Frac( a );
3180     aExp = extractFloatx80Exp( a );
3181     aSign = extractFloatx80Sign( a );
3182     shiftCount = 0x403E - aExp;
3183     if ( shiftCount <= 0 ) {
3184         if ( shiftCount ) {
3185             float_raise( float_flag_invalid STATUS_VAR);
3186             if (    ! aSign
3187                  || (    ( aExp == 0x7FFF )
3188                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3189                ) {
3190                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3191             }
3192             return (sbits64) LIT64( 0x8000000000000000 );
3193         }
3194         aSigExtra = 0;
3195     }
3196     else {
3197         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3198     }
3199     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3200
3201 }
3202
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the extended double-precision floating-
3205 | point value `a' to the 64-bit two's complement integer format.  The
3206 | conversion is performed according to the IEC/IEEE Standard for Binary
3207 | Floating-Point Arithmetic, except that the conversion is always rounded
3208 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3209 | Otherwise, if the conversion overflows, the largest integer with the same
3210 | sign as `a' is returned.
3211 *----------------------------------------------------------------------------*/
3212
3213 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3214 {
3215     flag aSign;
3216     int32 aExp, shiftCount;
3217     bits64 aSig;
3218     int64 z;
3219
3220     aSig = extractFloatx80Frac( a );
3221     aExp = extractFloatx80Exp( a );
3222     aSign = extractFloatx80Sign( a );
3223     shiftCount = aExp - 0x403E;
3224     if ( 0 <= shiftCount ) {
3225         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3226         if ( ( a.high != 0xC03E ) || aSig ) {
3227             float_raise( float_flag_invalid STATUS_VAR);
3228             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3229                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3230             }
3231         }
3232         return (sbits64) LIT64( 0x8000000000000000 );
3233     }
3234     else if ( aExp < 0x3FFF ) {
3235         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3236         return 0;
3237     }
3238     z = aSig>>( - shiftCount );
3239     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3240         STATUS(float_exception_flags) |= float_flag_inexact;
3241     }
3242     if ( aSign ) z = - z;
3243     return z;
3244
3245 }
3246
3247 /*----------------------------------------------------------------------------
3248 | Returns the result of converting the extended double-precision floating-
3249 | point value `a' to the single-precision floating-point format.  The
3250 | conversion is performed according to the IEC/IEEE Standard for Binary
3251 | Floating-Point Arithmetic.
3252 *----------------------------------------------------------------------------*/
3253
3254 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3255 {
3256     flag aSign;
3257     int32 aExp;
3258     bits64 aSig;
3259
3260     aSig = extractFloatx80Frac( a );
3261     aExp = extractFloatx80Exp( a );
3262     aSign = extractFloatx80Sign( a );
3263     if ( aExp == 0x7FFF ) {
3264         if ( (bits64) ( aSig<<1 ) ) {
3265             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3266         }
3267         return packFloat32( aSign, 0xFF, 0 );
3268     }
3269     shift64RightJamming( aSig, 33, &aSig );
3270     if ( aExp || aSig ) aExp -= 0x3F81;
3271     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3272
3273 }
3274
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of converting the extended double-precision floating-
3277 | point value `a' to the double-precision floating-point format.  The
3278 | conversion is performed according to the IEC/IEEE Standard for Binary
3279 | Floating-Point Arithmetic.
3280 *----------------------------------------------------------------------------*/
3281
3282 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3283 {
3284     flag aSign;
3285     int32 aExp;
3286     bits64 aSig, zSig;
3287
3288     aSig = extractFloatx80Frac( a );
3289     aExp = extractFloatx80Exp( a );
3290     aSign = extractFloatx80Sign( a );
3291     if ( aExp == 0x7FFF ) {
3292         if ( (bits64) ( aSig<<1 ) ) {
3293             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3294         }
3295         return packFloat64( aSign, 0x7FF, 0 );
3296     }
3297     shift64RightJamming( aSig, 1, &zSig );
3298     if ( aExp || aSig ) aExp -= 0x3C01;
3299     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3300
3301 }
3302
3303 #ifdef FLOAT128
3304
3305 /*----------------------------------------------------------------------------
3306 | Returns the result of converting the extended double-precision floating-
3307 | point value `a' to the quadruple-precision floating-point format.  The
3308 | conversion is performed according to the IEC/IEEE Standard for Binary
3309 | Floating-Point Arithmetic.
3310 *----------------------------------------------------------------------------*/
3311
3312 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3313 {
3314     flag aSign;
3315     int16 aExp;
3316     bits64 aSig, zSig0, zSig1;
3317
3318     aSig = extractFloatx80Frac( a );
3319     aExp = extractFloatx80Exp( a );
3320     aSign = extractFloatx80Sign( a );
3321     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3322         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3323     }
3324     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3325     return packFloat128( aSign, aExp, zSig0, zSig1 );
3326
3327 }
3328
3329 #endif
3330
3331 /*----------------------------------------------------------------------------
3332 | Rounds the extended double-precision floating-point value `a' to an integer,
3333 | and returns the result as an extended quadruple-precision floating-point
3334 | value.  The operation is performed according to the IEC/IEEE Standard for
3335 | Binary Floating-Point Arithmetic.
3336 *----------------------------------------------------------------------------*/
3337
3338 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3339 {
3340     flag aSign;
3341     int32 aExp;
3342     bits64 lastBitMask, roundBitsMask;
3343     int8 roundingMode;
3344     floatx80 z;
3345
3346     aExp = extractFloatx80Exp( a );
3347     if ( 0x403E <= aExp ) {
3348         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3349             return propagateFloatx80NaN( a, a STATUS_VAR );
3350         }
3351         return a;
3352     }
3353     if ( aExp < 0x3FFF ) {
3354         if (    ( aExp == 0 )
3355              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3356             return a;
3357         }
3358         STATUS(float_exception_flags) |= float_flag_inexact;
3359         aSign = extractFloatx80Sign( a );
3360         switch ( STATUS(float_rounding_mode) ) {
3361          case float_round_nearest_even:
3362             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3363                ) {
3364                 return
3365                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366             }
3367             break;
3368          case float_round_down:
3369             return
3370                   aSign ?
3371                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3372                 : packFloatx80( 0, 0, 0 );
3373          case float_round_up:
3374             return
3375                   aSign ? packFloatx80( 1, 0, 0 )
3376                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3377         }
3378         return packFloatx80( aSign, 0, 0 );
3379     }
3380     lastBitMask = 1;
3381     lastBitMask <<= 0x403E - aExp;
3382     roundBitsMask = lastBitMask - 1;
3383     z = a;
3384     roundingMode = STATUS(float_rounding_mode);
3385     if ( roundingMode == float_round_nearest_even ) {
3386         z.low += lastBitMask>>1;
3387         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3388     }
3389     else if ( roundingMode != float_round_to_zero ) {
3390         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3391             z.low += roundBitsMask;
3392         }
3393     }
3394     z.low &= ~ roundBitsMask;
3395     if ( z.low == 0 ) {
3396         ++z.high;
3397         z.low = LIT64( 0x8000000000000000 );
3398     }
3399     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3400     return z;
3401
3402 }
3403
3404 /*----------------------------------------------------------------------------
3405 | Returns the result of adding the absolute values of the extended double-
3406 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3407 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3408 | The addition is performed according to the IEC/IEEE Standard for Binary
3409 | Floating-Point Arithmetic.
3410 *----------------------------------------------------------------------------*/
3411
3412 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3413 {
3414     int32 aExp, bExp, zExp;
3415     bits64 aSig, bSig, zSig0, zSig1;
3416     int32 expDiff;
3417
3418     aSig = extractFloatx80Frac( a );
3419     aExp = extractFloatx80Exp( a );
3420     bSig = extractFloatx80Frac( b );
3421     bExp = extractFloatx80Exp( b );
3422     expDiff = aExp - bExp;
3423     if ( 0 < expDiff ) {
3424         if ( aExp == 0x7FFF ) {
3425             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3426             return a;
3427         }
3428         if ( bExp == 0 ) --expDiff;
3429         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3430         zExp = aExp;
3431     }
3432     else if ( expDiff < 0 ) {
3433         if ( bExp == 0x7FFF ) {
3434             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3435             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3436         }
3437         if ( aExp == 0 ) ++expDiff;
3438         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3439         zExp = bExp;
3440     }
3441     else {
3442         if ( aExp == 0x7FFF ) {
3443             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3444                 return propagateFloatx80NaN( a, b STATUS_VAR );
3445             }
3446             return a;
3447         }
3448         zSig1 = 0;
3449         zSig0 = aSig + bSig;
3450         if ( aExp == 0 ) {
3451             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3452             goto roundAndPack;
3453         }
3454         zExp = aExp;
3455         goto shiftRight1;
3456     }
3457     zSig0 = aSig + bSig;
3458     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3459  shiftRight1:
3460     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3461     zSig0 |= LIT64( 0x8000000000000000 );
3462     ++zExp;
3463  roundAndPack:
3464     return
3465         roundAndPackFloatx80(
3466             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3467
3468 }
3469
3470 /*----------------------------------------------------------------------------
3471 | Returns the result of subtracting the absolute values of the extended
3472 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3473 | difference is negated before being returned.  `zSign' is ignored if the
3474 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3475 | Standard for Binary Floating-Point Arithmetic.
3476 *----------------------------------------------------------------------------*/
3477
3478 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3479 {
3480     int32 aExp, bExp, zExp;
3481     bits64 aSig, bSig, zSig0, zSig1;
3482     int32 expDiff;
3483     floatx80 z;
3484
3485     aSig = extractFloatx80Frac( a );
3486     aExp = extractFloatx80Exp( a );
3487     bSig = extractFloatx80Frac( b );
3488     bExp = extractFloatx80Exp( b );
3489     expDiff = aExp - bExp;
3490     if ( 0 < expDiff ) goto aExpBigger;
3491     if ( expDiff < 0 ) goto bExpBigger;
3492     if ( aExp == 0x7FFF ) {
3493         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3494             return propagateFloatx80NaN( a, b STATUS_VAR );
3495         }
3496         float_raise( float_flag_invalid STATUS_VAR);
3497         z.low = floatx80_default_nan_low;
3498         z.high = floatx80_default_nan_high;
3499         return z;
3500     }
3501     if ( aExp == 0 ) {
3502         aExp = 1;
3503         bExp = 1;
3504     }
3505     zSig1 = 0;
3506     if ( bSig < aSig ) goto aBigger;
3507     if ( aSig < bSig ) goto bBigger;
3508     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3509  bExpBigger:
3510     if ( bExp == 0x7FFF ) {
3511         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3513     }
3514     if ( aExp == 0 ) ++expDiff;
3515     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516  bBigger:
3517     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3518     zExp = bExp;
3519     zSign ^= 1;
3520     goto normalizeRoundAndPack;
3521  aExpBigger:
3522     if ( aExp == 0x7FFF ) {
3523         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3524         return a;
3525     }
3526     if ( bExp == 0 ) --expDiff;
3527     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3528  aBigger:
3529     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3530     zExp = aExp;
3531  normalizeRoundAndPack:
3532     return
3533         normalizeRoundAndPackFloatx80(
3534             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3535
3536 }
3537
3538 /*----------------------------------------------------------------------------
3539 | Returns the result of adding the extended double-precision floating-point
3540 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3541 | Standard for Binary Floating-Point Arithmetic.
3542 *----------------------------------------------------------------------------*/
3543
3544 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3545 {
3546     flag aSign, bSign;
3547
3548     aSign = extractFloatx80Sign( a );
3549     bSign = extractFloatx80Sign( b );
3550     if ( aSign == bSign ) {
3551         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3552     }
3553     else {
3554         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3555     }
3556
3557 }
3558
3559 /*----------------------------------------------------------------------------
3560 | Returns the result of subtracting the extended double-precision floating-
3561 | point values `a' and `b'.  The operation is performed according to the
3562 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3563 *----------------------------------------------------------------------------*/
3564
3565 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3566 {
3567     flag aSign, bSign;
3568
3569     aSign = extractFloatx80Sign( a );
3570     bSign = extractFloatx80Sign( b );
3571     if ( aSign == bSign ) {
3572         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3573     }
3574     else {
3575         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3576     }
3577
3578 }
3579
3580 /*----------------------------------------------------------------------------
3581 | Returns the result of multiplying the extended double-precision floating-
3582 | point values `a' and `b'.  The operation is performed according to the
3583 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3584 *----------------------------------------------------------------------------*/
3585
3586 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3587 {
3588     flag aSign, bSign, zSign;
3589     int32 aExp, bExp, zExp;
3590     bits64 aSig, bSig, zSig0, zSig1;
3591     floatx80 z;
3592
3593     aSig = extractFloatx80Frac( a );
3594     aExp = extractFloatx80Exp( a );
3595     aSign = extractFloatx80Sign( a );
3596     bSig = extractFloatx80Frac( b );
3597     bExp = extractFloatx80Exp( b );
3598     bSign = extractFloatx80Sign( b );
3599     zSign = aSign ^ bSign;
3600     if ( aExp == 0x7FFF ) {
3601         if (    (bits64) ( aSig<<1 )
3602              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3603             return propagateFloatx80NaN( a, b STATUS_VAR );
3604         }
3605         if ( ( bExp | bSig ) == 0 ) goto invalid;
3606         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607     }
3608     if ( bExp == 0x7FFF ) {
3609         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3610         if ( ( aExp | aSig ) == 0 ) {
3611  invalid:
3612             float_raise( float_flag_invalid STATUS_VAR);
3613             z.low = floatx80_default_nan_low;
3614             z.high = floatx80_default_nan_high;
3615             return z;
3616         }
3617         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3618     }
3619     if ( aExp == 0 ) {
3620         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3621         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3622     }
3623     if ( bExp == 0 ) {
3624         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3625         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3626     }
3627     zExp = aExp + bExp - 0x3FFE;
3628     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3629     if ( 0 < (sbits64) zSig0 ) {
3630         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631         --zExp;
3632     }
3633     return
3634         roundAndPackFloatx80(
3635             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3636
3637 }
3638
3639 /*----------------------------------------------------------------------------
3640 | Returns the result of dividing the extended double-precision floating-point
3641 | value `a' by the corresponding value `b'.  The operation is performed
3642 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3643 *----------------------------------------------------------------------------*/
3644
3645 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3646 {
3647     flag aSign, bSign, zSign;
3648     int32 aExp, bExp, zExp;
3649     bits64 aSig, bSig, zSig0, zSig1;
3650     bits64 rem0, rem1, rem2, term0, term1, term2;
3651     floatx80 z;
3652
3653     aSig = extractFloatx80Frac( a );
3654     aExp = extractFloatx80Exp( a );
3655     aSign = extractFloatx80Sign( a );
3656     bSig = extractFloatx80Frac( b );
3657     bExp = extractFloatx80Exp( b );
3658     bSign = extractFloatx80Sign( b );
3659     zSign = aSign ^ bSign;
3660     if ( aExp == 0x7FFF ) {
3661         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3662         if ( bExp == 0x7FFF ) {
3663             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3664             goto invalid;
3665         }
3666         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3667     }
3668     if ( bExp == 0x7FFF ) {
3669         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3670         return packFloatx80( zSign, 0, 0 );
3671     }
3672     if ( bExp == 0 ) {
3673         if ( bSig == 0 ) {
3674             if ( ( aExp | aSig ) == 0 ) {
3675  invalid:
3676                 float_raise( float_flag_invalid STATUS_VAR);
3677                 z.low = floatx80_default_nan_low;
3678                 z.high = floatx80_default_nan_high;
3679                 return z;
3680             }
3681             float_raise( float_flag_divbyzero STATUS_VAR);
3682             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683         }
3684         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3685     }
3686     if ( aExp == 0 ) {
3687         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3688         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3689     }
3690     zExp = aExp - bExp + 0x3FFE;
3691     rem1 = 0;
3692     if ( bSig <= aSig ) {
3693         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3694         ++zExp;
3695     }
3696     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3697     mul64To128( bSig, zSig0, &term0, &term1 );
3698     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3699     while ( (sbits64) rem0 < 0 ) {
3700         --zSig0;
3701         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3702     }
3703     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3704     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3705         mul64To128( bSig, zSig1, &term1, &term2 );
3706         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3707         while ( (sbits64) rem1 < 0 ) {
3708             --zSig1;
3709             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3710         }
3711         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3712     }
3713     return
3714         roundAndPackFloatx80(
3715             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3716
3717 }
3718
3719 /*----------------------------------------------------------------------------
3720 | Returns the remainder of the extended double-precision floating-point value
3721 | `a' with respect to the corresponding value `b'.  The operation is performed
3722 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723 *----------------------------------------------------------------------------*/
3724
3725 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3726 {
3727     flag aSign, bSign, zSign;
3728     int32 aExp, bExp, expDiff;
3729     bits64 aSig0, aSig1, bSig;
3730     bits64 q, term0, term1, alternateASig0, alternateASig1;
3731     floatx80 z;
3732
3733     aSig0 = extractFloatx80Frac( a );
3734     aExp = extractFloatx80Exp( a );
3735     aSign = extractFloatx80Sign( a );
3736     bSig = extractFloatx80Frac( b );
3737     bExp = extractFloatx80Exp( b );
3738     bSign = extractFloatx80Sign( b );
3739     if ( aExp == 0x7FFF ) {
3740         if (    (bits64) ( aSig0<<1 )
3741              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3742             return propagateFloatx80NaN( a, b STATUS_VAR );
3743         }
3744         goto invalid;
3745     }
3746     if ( bExp == 0x7FFF ) {
3747         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3748         return a;
3749     }
3750     if ( bExp == 0 ) {
3751         if ( bSig == 0 ) {
3752  invalid:
3753             float_raise( float_flag_invalid STATUS_VAR);
3754             z.low = floatx80_default_nan_low;
3755             z.high = floatx80_default_nan_high;
3756             return z;
3757         }
3758         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759     }
3760     if ( aExp == 0 ) {
3761         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3762         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3763     }
3764     bSig |= LIT64( 0x8000000000000000 );
3765     zSign = aSign;
3766     expDiff = aExp - bExp;
3767     aSig1 = 0;
3768     if ( expDiff < 0 ) {
3769         if ( expDiff < -1 ) return a;
3770         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3771         expDiff = 0;
3772     }
3773     q = ( bSig <= aSig0 );
3774     if ( q ) aSig0 -= bSig;
3775     expDiff -= 64;
3776     while ( 0 < expDiff ) {
3777         q = estimateDiv128To64( aSig0, aSig1, bSig );
3778         q = ( 2 < q ) ? q - 2 : 0;
3779         mul64To128( bSig, q, &term0, &term1 );
3780         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3781         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3782         expDiff -= 62;
3783     }
3784     expDiff += 64;
3785     if ( 0 < expDiff ) {
3786         q = estimateDiv128To64( aSig0, aSig1, bSig );
3787         q = ( 2 < q ) ? q - 2 : 0;
3788         q >>= 64 - expDiff;
3789         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3790         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3791         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3792         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3793             ++q;
3794             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3795         }
3796     }
3797     else {
3798         term1 = 0;
3799         term0 = bSig;
3800     }
3801     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3802     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3803          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3804               && ( q & 1 ) )
3805        ) {
3806         aSig0 = alternateASig0;
3807         aSig1 = alternateASig1;
3808         zSign = ! zSign;
3809     }
3810     return
3811         normalizeRoundAndPackFloatx80(
3812             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3813
3814 }
3815
3816 /*----------------------------------------------------------------------------
3817 | Returns the square root of the extended double-precision floating-point
3818 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3819 | for Binary Floating-Point Arithmetic.
3820 *----------------------------------------------------------------------------*/
3821
3822 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3823 {
3824     flag aSign;
3825     int32 aExp, zExp;
3826     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3827     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3828     floatx80 z;
3829
3830     aSig0 = extractFloatx80Frac( a );
3831     aExp = extractFloatx80Exp( a );
3832     aSign = extractFloatx80Sign( a );
3833     if ( aExp == 0x7FFF ) {
3834         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3835         if ( ! aSign ) return a;
3836         goto invalid;
3837     }
3838     if ( aSign ) {
3839         if ( ( aExp | aSig0 ) == 0 ) return a;
3840  invalid:
3841         float_raise( float_flag_invalid STATUS_VAR);
3842         z.low = floatx80_default_nan_low;
3843         z.high = floatx80_default_nan_high;
3844         return z;
3845     }
3846     if ( aExp == 0 ) {
3847         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3848         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3849     }
3850     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3851     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3852     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3853     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3854     doubleZSig0 = zSig0<<1;
3855     mul64To128( zSig0, zSig0, &term0, &term1 );
3856     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3857     while ( (sbits64) rem0 < 0 ) {
3858         --zSig0;
3859         doubleZSig0 -= 2;
3860         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3861     }
3862     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3863     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3864         if ( zSig1 == 0 ) zSig1 = 1;
3865         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3866         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867         mul64To128( zSig1, zSig1, &term2, &term3 );
3868         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3869         while ( (sbits64) rem1 < 0 ) {
3870             --zSig1;
3871             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3872             term3 |= 1;
3873             term2 |= doubleZSig0;
3874             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3875         }
3876         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3877     }
3878     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3879     zSig0 |= doubleZSig0;
3880     return
3881         roundAndPackFloatx80(
3882             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3883
3884 }
3885
3886 /*----------------------------------------------------------------------------
3887 | Returns 1 if the extended double-precision floating-point value `a' is
3888 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3889 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3890 | Arithmetic.
3891 *----------------------------------------------------------------------------*/
3892
3893 flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3894 {
3895
3896     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3897               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3898          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3899               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3900        ) {
3901         if (    floatx80_is_signaling_nan( a )
3902              || floatx80_is_signaling_nan( b ) ) {
3903             float_raise( float_flag_invalid STATUS_VAR);
3904         }
3905         return 0;
3906     }
3907     return
3908            ( a.low == b.low )
3909         && (    ( a.high == b.high )
3910              || (    ( a.low == 0 )
3911                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3912            );
3913
3914 }
3915
3916 /*----------------------------------------------------------------------------
3917 | Returns 1 if the extended double-precision floating-point value `a' is
3918 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3919 | comparison is performed according to the IEC/IEEE Standard for Binary
3920 | Floating-Point Arithmetic.
3921 *----------------------------------------------------------------------------*/
3922
3923 flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3924 {
3925     flag aSign, bSign;
3926
3927     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3928               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3929          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3930               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3931        ) {
3932         float_raise( float_flag_invalid STATUS_VAR);
3933         return 0;
3934     }
3935     aSign = extractFloatx80Sign( a );
3936     bSign = extractFloatx80Sign( b );
3937     if ( aSign != bSign ) {
3938         return
3939                aSign
3940             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3941                  == 0 );
3942     }
3943     return
3944           aSign ? le128( b.high, b.low, a.high, a.low )
3945         : le128( a.high, a.low, b.high, b.low );
3946
3947 }
3948
3949 /*----------------------------------------------------------------------------
3950 | Returns 1 if the extended double-precision floating-point value `a' is
3951 | less than the corresponding value `b', and 0 otherwise.  The comparison
3952 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3953 | Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955
3956 flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3957 {
3958     flag aSign, bSign;
3959
3960     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3961               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3962          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3963               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3964        ) {
3965         float_raise( float_flag_invalid STATUS_VAR);
3966         return 0;
3967     }
3968     aSign = extractFloatx80Sign( a );
3969     bSign = extractFloatx80Sign( b );
3970     if ( aSign != bSign ) {
3971         return
3972                aSign
3973             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3974                  != 0 );
3975     }
3976     return
3977           aSign ? lt128( b.high, b.low, a.high, a.low )
3978         : lt128( a.high, a.low, b.high, b.low );
3979
3980 }
3981
3982 /*----------------------------------------------------------------------------
3983 | Returns 1 if the extended double-precision floating-point value `a' is equal
3984 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3985 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3987 *----------------------------------------------------------------------------*/
3988
3989 flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3990 {
3991
3992     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3993               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3995               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996        ) {
3997         float_raise( float_flag_invalid STATUS_VAR);
3998         return 0;
3999     }
4000     return
4001            ( a.low == b.low )
4002         && (    ( a.high == b.high )
4003              || (    ( a.low == 0 )
4004                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4005            );
4006
4007 }
4008
4009 /*----------------------------------------------------------------------------
4010 | Returns 1 if the extended double-precision floating-point value `a' is less
4011 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4012 | do not cause an exception.  Otherwise, the comparison is performed according
4013 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4014 *----------------------------------------------------------------------------*/
4015
4016 flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4017 {
4018     flag aSign, bSign;
4019
4020     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4021               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4023               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024        ) {
4025         if (    floatx80_is_signaling_nan( a )
4026              || floatx80_is_signaling_nan( b ) ) {
4027             float_raise( float_flag_invalid STATUS_VAR);
4028         }
4029         return 0;
4030     }
4031     aSign = extractFloatx80Sign( a );
4032     bSign = extractFloatx80Sign( b );
4033     if ( aSign != bSign ) {
4034         return
4035                aSign
4036             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4037                  == 0 );
4038     }
4039     return
4040           aSign ? le128( b.high, b.low, a.high, a.low )
4041         : le128( a.high, a.low, b.high, b.low );
4042
4043 }
4044
4045 /*----------------------------------------------------------------------------
4046 | Returns 1 if the extended double-precision floating-point value `a' is less
4047 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4048 | an exception.  Otherwise, the comparison is performed according to the
4049 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4050 *----------------------------------------------------------------------------*/
4051
4052 flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4053 {
4054     flag aSign, bSign;
4055
4056     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4057               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4058          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4059               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4060        ) {
4061         if (    floatx80_is_signaling_nan( a )
4062              || floatx80_is_signaling_nan( b ) ) {
4063             float_raise( float_flag_invalid STATUS_VAR);
4064         }
4065         return 0;
4066     }
4067     aSign = extractFloatx80Sign( a );
4068     bSign = extractFloatx80Sign( b );
4069     if ( aSign != bSign ) {
4070         return
4071                aSign
4072             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4073                  != 0 );
4074     }
4075     return
4076           aSign ? lt128( b.high, b.low, a.high, a.low )
4077         : lt128( a.high, a.low, b.high, b.low );
4078
4079 }
4080
4081 #endif
4082
4083 #ifdef FLOAT128
4084
4085 /*----------------------------------------------------------------------------
4086 | Returns the result of converting the quadruple-precision floating-point
4087 | value `a' to the 32-bit two's complement integer format.  The conversion
4088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4089 | Arithmetic---which means in particular that the conversion is rounded
4090 | according to the current rounding mode.  If `a' is a NaN, the largest
4091 | positive integer is returned.  Otherwise, if the conversion overflows, the
4092 | largest integer with the same sign as `a' is returned.
4093 *----------------------------------------------------------------------------*/
4094
4095 int32 float128_to_int32( float128 a STATUS_PARAM )
4096 {
4097     flag aSign;
4098     int32 aExp, shiftCount;
4099     bits64 aSig0, aSig1;
4100
4101     aSig1 = extractFloat128Frac1( a );
4102     aSig0 = extractFloat128Frac0( a );
4103     aExp = extractFloat128Exp( a );
4104     aSign = extractFloat128Sign( a );
4105     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4106     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4107     aSig0 |= ( aSig1 != 0 );
4108     shiftCount = 0x4028 - aExp;
4109     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4110     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4111
4112 }
4113
4114 /*----------------------------------------------------------------------------
4115 | Returns the result of converting the quadruple-precision floating-point
4116 | value `a' to the 32-bit two's complement integer format.  The conversion
4117 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118 | Arithmetic, except that the conversion is always rounded toward zero.  If
4119 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4120 | conversion overflows, the largest integer with the same sign as `a' is
4121 | returned.
4122 *----------------------------------------------------------------------------*/
4123
4124 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4125 {
4126     flag aSign;
4127     int32 aExp, shiftCount;
4128     bits64 aSig0, aSig1, savedASig;
4129     int32 z;
4130
4131     aSig1 = extractFloat128Frac1( a );
4132     aSig0 = extractFloat128Frac0( a );
4133     aExp = extractFloat128Exp( a );
4134     aSign = extractFloat128Sign( a );
4135     aSig0 |= ( aSig1 != 0 );
4136     if ( 0x401E < aExp ) {
4137         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4138         goto invalid;
4139     }
4140     else if ( aExp < 0x3FFF ) {
4141         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4142         return 0;
4143     }
4144     aSig0 |= LIT64( 0x0001000000000000 );
4145     shiftCount = 0x402F - aExp;
4146     savedASig = aSig0;
4147     aSig0 >>= shiftCount;
4148     z = aSig0;
4149     if ( aSign ) z = - z;
4150     if ( ( z < 0 ) ^ aSign ) {
4151  invalid:
4152         float_raise( float_flag_invalid STATUS_VAR);
4153         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4154     }
4155     if ( ( aSig0<<shiftCount ) != savedASig ) {
4156         STATUS(float_exception_flags) |= float_flag_inexact;
4157     }
4158     return z;
4159
4160 }
4161
4162 /*----------------------------------------------------------------------------
4163 | Returns the result of converting the quadruple-precision floating-point
4164 | value `a' to the 64-bit two's complement integer format.  The conversion
4165 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166 | Arithmetic---which means in particular that the conversion is rounded
4167 | according to the current rounding mode.  If `a' is a NaN, the largest
4168 | positive integer is returned.  Otherwise, if the conversion overflows, the
4169 | largest integer with the same sign as `a' is returned.
4170 *----------------------------------------------------------------------------*/
4171
4172 int64 float128_to_int64( float128 a STATUS_PARAM )
4173 {
4174     flag aSign;
4175     int32 aExp, shiftCount;
4176     bits64 aSig0, aSig1;
4177
4178     aSig1 = extractFloat128Frac1( a );
4179     aSig0 = extractFloat128Frac0( a );
4180     aExp = extractFloat128Exp( a );
4181     aSign = extractFloat128Sign( a );
4182     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4183     shiftCount = 0x402F - aExp;
4184     if ( shiftCount <= 0 ) {
4185         if ( 0x403E < aExp ) {
4186             float_raise( float_flag_invalid STATUS_VAR);
4187             if (    ! aSign
4188                  || (    ( aExp == 0x7FFF )
4189                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4190                     )
4191                ) {
4192                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4193             }
4194             return (sbits64) LIT64( 0x8000000000000000 );
4195         }
4196         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4197     }
4198     else {
4199         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4200     }
4201     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4202
4203 }
4204
4205 /*----------------------------------------------------------------------------
4206 | Returns the result of converting the quadruple-precision floating-point
4207 | value `a' to the 64-bit two's complement integer format.  The conversion
4208 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4209 | Arithmetic, except that the conversion is always rounded toward zero.
4210 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4211 | the conversion overflows, the largest integer with the same sign as `a' is
4212 | returned.
4213 *----------------------------------------------------------------------------*/
4214
4215 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4216 {
4217     flag aSign;
4218     int32 aExp, shiftCount;
4219     bits64 aSig0, aSig1;
4220     int64 z;
4221
4222     aSig1 = extractFloat128Frac1( a );
4223     aSig0 = extractFloat128Frac0( a );
4224     aExp = extractFloat128Exp( a );
4225     aSign = extractFloat128Sign( a );
4226     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4227     shiftCount = aExp - 0x402F;
4228     if ( 0 < shiftCount ) {
4229         if ( 0x403E <= aExp ) {
4230             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4231             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4232                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4233                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4234             }
4235             else {
4236                 float_raise( float_flag_invalid STATUS_VAR);
4237                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4238                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4239                 }
4240             }
4241             return (sbits64) LIT64( 0x8000000000000000 );
4242         }
4243         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4244         if ( (bits64) ( aSig1<<shiftCount ) ) {
4245             STATUS(float_exception_flags) |= float_flag_inexact;
4246         }
4247     }
4248     else {
4249         if ( aExp < 0x3FFF ) {
4250             if ( aExp | aSig0 | aSig1 ) {
4251                 STATUS(float_exception_flags) |= float_flag_inexact;
4252             }
4253             return 0;
4254         }
4255         z = aSig0>>( - shiftCount );
4256         if (    aSig1
4257              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4258             STATUS(float_exception_flags) |= float_flag_inexact;
4259         }
4260     }
4261     if ( aSign ) z = - z;
4262     return z;
4263
4264 }
4265
4266 /*----------------------------------------------------------------------------
4267 | Returns the result of converting the quadruple-precision floating-point
4268 | value `a' to the single-precision floating-point format.  The conversion
4269 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4270 | Arithmetic.
4271 *----------------------------------------------------------------------------*/
4272
4273 float32 float128_to_float32( float128 a STATUS_PARAM )
4274 {
4275     flag aSign;
4276     int32 aExp;
4277     bits64 aSig0, aSig1;
4278     bits32 zSig;
4279
4280     aSig1 = extractFloat128Frac1( a );
4281     aSig0 = extractFloat128Frac0( a );
4282     aExp = extractFloat128Exp( a );
4283     aSign = extractFloat128Sign( a );
4284     if ( aExp == 0x7FFF ) {
4285         if ( aSig0 | aSig1 ) {
4286             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4287         }
4288         return packFloat32( aSign, 0xFF, 0 );
4289     }
4290     aSig0 |= ( aSig1 != 0 );
4291     shift64RightJamming( aSig0, 18, &aSig0 );
4292     zSig = aSig0;
4293     if ( aExp || zSig ) {
4294         zSig |= 0x40000000;
4295         aExp -= 0x3F81;
4296     }
4297     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4298
4299 }
4300
4301 /*----------------------------------------------------------------------------
4302 | Returns the result of converting the quadruple-precision floating-point
4303 | value `a' to the double-precision floating-point format.  The conversion
4304 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 | Arithmetic.
4306 *----------------------------------------------------------------------------*/
4307
4308 float64 float128_to_float64( float128 a STATUS_PARAM )
4309 {
4310     flag aSign;
4311     int32 aExp;
4312     bits64 aSig0, aSig1;
4313
4314     aSig1 = extractFloat128Frac1( a );
4315     aSig0 = extractFloat128Frac0( a );
4316     aExp = extractFloat128Exp( a );
4317     aSign = extractFloat128Sign( a );
4318     if ( aExp == 0x7FFF ) {
4319         if ( aSig0 | aSig1 ) {
4320             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4321         }
4322         return packFloat64( aSign, 0x7FF, 0 );
4323     }
4324     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4325     aSig0 |= ( aSig1 != 0 );
4326     if ( aExp || aSig0 ) {
4327         aSig0 |= LIT64( 0x4000000000000000 );
4328         aExp -= 0x3C01;
4329     }
4330     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4331
4332 }
4333
4334 #ifdef FLOATX80
4335
4336 /*----------------------------------------------------------------------------
4337 | Returns the result of converting the quadruple-precision floating-point
4338 | value `a' to the extended double-precision floating-point format.  The
4339 | conversion is performed according to the IEC/IEEE Standard for Binary
4340 | Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342
4343 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4344 {
4345     flag aSign;
4346     int32 aExp;
4347     bits64 aSig0, aSig1;
4348
4349     aSig1 = extractFloat128Frac1( a );
4350     aSig0 = extractFloat128Frac0( a );
4351     aExp = extractFloat128Exp( a );
4352     aSign = extractFloat128Sign( a );
4353     if ( aExp == 0x7FFF ) {
4354         if ( aSig0 | aSig1 ) {
4355             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4356         }
4357         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4358     }
4359     if ( aExp == 0 ) {
4360         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4361         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4362     }
4363     else {
4364         aSig0 |= LIT64( 0x0001000000000000 );
4365     }
4366     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4367     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4368
4369 }
4370
4371 #endif
4372
4373 /*----------------------------------------------------------------------------
4374 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4375 | returns the result as a quadruple-precision floating-point value.  The
4376 | operation is performed according to the IEC/IEEE Standard for Binary
4377 | Floating-Point Arithmetic.
4378 *----------------------------------------------------------------------------*/
4379
4380 float128 float128_round_to_int( float128 a STATUS_PARAM )
4381 {
4382     flag aSign;
4383     int32 aExp;
4384     bits64 lastBitMask, roundBitsMask;
4385     int8 roundingMode;
4386     float128 z;
4387
4388     aExp = extractFloat128Exp( a );
4389     if ( 0x402F <= aExp ) {
4390         if ( 0x406F <= aExp ) {
4391             if (    ( aExp == 0x7FFF )
4392                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4393                ) {
4394                 return propagateFloat128NaN( a, a STATUS_VAR );
4395             }
4396             return a;
4397         }
4398         lastBitMask = 1;
4399         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4400         roundBitsMask = lastBitMask - 1;
4401         z = a;
4402         roundingMode = STATUS(float_rounding_mode);
4403         if ( roundingMode == float_round_nearest_even ) {
4404             if ( lastBitMask ) {
4405                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4406                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4407             }
4408             else {
4409                 if ( (sbits64) z.low < 0 ) {
4410                     ++z.high;
4411                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4412                 }
4413             }
4414         }
4415         else if ( roundingMode != float_round_to_zero ) {
4416             if (   extractFloat128Sign( z )
4417                  ^ ( roundingMode == float_round_up ) ) {
4418                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4419             }
4420         }
4421         z.low &= ~ roundBitsMask;
4422     }
4423     else {
4424         if ( aExp < 0x3FFF ) {
4425             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4426             STATUS(float_exception_flags) |= float_flag_inexact;
4427             aSign = extractFloat128Sign( a );
4428             switch ( STATUS(float_rounding_mode) ) {
4429              case float_round_nearest_even:
4430                 if (    ( aExp == 0x3FFE )
4431                      && (   extractFloat128Frac0( a )
4432                           | extractFloat128Frac1( a ) )
4433                    ) {
4434                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4435                 }
4436                 break;
4437              case float_round_down:
4438                 return
4439                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4440                     : packFloat128( 0, 0, 0, 0 );
4441              case float_round_up:
4442                 return
4443                       aSign ? packFloat128( 1, 0, 0, 0 )
4444                     : packFloat128( 0, 0x3FFF, 0, 0 );
4445             }
4446             return packFloat128( aSign, 0, 0, 0 );
4447         }
4448         lastBitMask = 1;
4449         lastBitMask <<= 0x402F - aExp;
4450         roundBitsMask = lastBitMask - 1;
4451         z.low = 0;
4452         z.high = a.high;
4453         roundingMode = STATUS(float_rounding_mode);
4454         if ( roundingMode == float_round_nearest_even ) {
4455             z.high += lastBitMask>>1;
4456             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4457                 z.high &= ~ lastBitMask;
4458             }
4459         }
4460         else if ( roundingMode != float_round_to_zero ) {
4461             if (   extractFloat128Sign( z )
4462                  ^ ( roundingMode == float_round_up ) ) {
4463                 z.high |= ( a.low != 0 );
4464                 z.high += roundBitsMask;
4465             }
4466         }
4467         z.high &= ~ roundBitsMask;
4468     }
4469     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4470         STATUS(float_exception_flags) |= float_flag_inexact;
4471     }
4472     return z;
4473
4474 }
4475
4476 /*----------------------------------------------------------------------------
4477 | Returns the result of adding the absolute values of the quadruple-precision
4478 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4479 | before being returned.  `zSign' is ignored if the result is a NaN.
4480 | The addition is performed according to the IEC/IEEE Standard for Binary
4481 | Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4483
4484 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4485 {
4486     int32 aExp, bExp, zExp;
4487     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4488     int32 expDiff;
4489
4490     aSig1 = extractFloat128Frac1( a );
4491     aSig0 = extractFloat128Frac0( a );
4492     aExp = extractFloat128Exp( a );
4493     bSig1 = extractFloat128Frac1( b );
4494     bSig0 = extractFloat128Frac0( b );
4495     bExp = extractFloat128Exp( b );
4496     expDiff = aExp - bExp;
4497     if ( 0 < expDiff ) {
4498         if ( aExp == 0x7FFF ) {
4499             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4500             return a;
4501         }
4502         if ( bExp == 0 ) {
4503             --expDiff;
4504         }
4505         else {
4506             bSig0 |= LIT64( 0x0001000000000000 );
4507         }
4508         shift128ExtraRightJamming(
4509             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4510         zExp = aExp;
4511     }
4512     else if ( expDiff < 0 ) {
4513         if ( bExp == 0x7FFF ) {
4514             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4515             return packFloat128( zSign, 0x7FFF, 0, 0 );
4516         }
4517         if ( aExp == 0 ) {
4518             ++expDiff;
4519         }
4520         else {
4521             aSig0 |= LIT64( 0x0001000000000000 );
4522         }
4523         shift128ExtraRightJamming(
4524             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4525         zExp = bExp;
4526     }
4527     else {
4528         if ( aExp == 0x7FFF ) {
4529             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4530                 return propagateFloat128NaN( a, b STATUS_VAR );
4531             }
4532             return a;
4533         }
4534         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4535         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4536         zSig2 = 0;
4537         zSig0 |= LIT64( 0x0002000000000000 );
4538         zExp = aExp;
4539         goto shiftRight1;
4540     }
4541     aSig0 |= LIT64( 0x0001000000000000 );
4542     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4543     --zExp;
4544     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4545     ++zExp;
4546  shiftRight1:
4547     shift128ExtraRightJamming(
4548         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4549  roundAndPack:
4550     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4551
4552 }
4553
4554 /*----------------------------------------------------------------------------
4555 | Returns the result of subtracting the absolute values of the quadruple-
4556 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4557 | difference is negated before being returned.  `zSign' is ignored if the
4558 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4559 | Standard for Binary Floating-Point Arithmetic.
4560 *----------------------------------------------------------------------------*/
4561
4562 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4563 {
4564     int32 aExp, bExp, zExp;
4565     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4566     int32 expDiff;
4567     float128 z;
4568
4569     aSig1 = extractFloat128Frac1( a );
4570     aSig0 = extractFloat128Frac0( a );
4571     aExp = extractFloat128Exp( a );
4572     bSig1 = extractFloat128Frac1( b );
4573     bSig0 = extractFloat128Frac0( b );
4574     bExp = extractFloat128Exp( b );
4575     expDiff = aExp - bExp;
4576     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4577     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4578     if ( 0 < expDiff ) goto aExpBigger;
4579     if ( expDiff < 0 ) goto bExpBigger;
4580     if ( aExp == 0x7FFF ) {
4581         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4582             return propagateFloat128NaN( a, b STATUS_VAR );
4583         }
4584         float_raise( float_flag_invalid STATUS_VAR);
4585         z.low = float128_default_nan_low;
4586         z.high = float128_default_nan_high;
4587         return z;
4588     }
4589     if ( aExp == 0 ) {
4590         aExp = 1;
4591         bExp = 1;
4592     }
4593     if ( bSig0 < aSig0 ) goto aBigger;
4594     if ( aSig0 < bSig0 ) goto bBigger;
4595     if ( bSig1 < aSig1 ) goto aBigger;
4596     if ( aSig1 < bSig1 ) goto bBigger;
4597     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4598  bExpBigger:
4599     if ( bExp == 0x7FFF ) {
4600         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4601         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4602     }
4603     if ( aExp == 0 ) {
4604         ++expDiff;
4605     }
4606     else {
4607         aSig0 |= LIT64( 0x4000000000000000 );
4608     }
4609     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4610     bSig0 |= LIT64( 0x4000000000000000 );
4611  bBigger:
4612     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4613     zExp = bExp;
4614     zSign ^= 1;
4615     goto normalizeRoundAndPack;
4616  aExpBigger:
4617     if ( aExp == 0x7FFF ) {
4618         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4619         return a;
4620     }
4621     if ( bExp == 0 ) {
4622         --expDiff;
4623     }
4624     else {
4625         bSig0 |= LIT64( 0x4000000000000000 );
4626     }
4627     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4628     aSig0 |= LIT64( 0x4000000000000000 );
4629  aBigger:
4630     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4631     zExp = aExp;
4632  normalizeRoundAndPack:
4633     --zExp;
4634     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4635
4636 }
4637
4638 /*----------------------------------------------------------------------------
4639 | Returns the result of adding the quadruple-precision floating-point values
4640 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4641 | for Binary Floating-Point Arithmetic.
4642 *----------------------------------------------------------------------------*/
4643
4644 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4645 {
4646     flag aSign, bSign;
4647
4648     aSign = extractFloat128Sign( a );
4649     bSign = extractFloat128Sign( b );
4650     if ( aSign == bSign ) {
4651         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4652     }
4653     else {
4654         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4655     }
4656
4657 }
4658
4659 /*----------------------------------------------------------------------------
4660 | Returns the result of subtracting the quadruple-precision floating-point
4661 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4662 | Standard for Binary Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4664
4665 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4666 {
4667     flag aSign, bSign;
4668
4669     aSign = extractFloat128Sign( a );
4670     bSign = extractFloat128Sign( b );
4671     if ( aSign == bSign ) {
4672         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4673     }
4674     else {
4675         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4676     }
4677
4678 }
4679
4680 /*----------------------------------------------------------------------------
4681 | Returns the result of multiplying the quadruple-precision floating-point
4682 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4683 | Standard for Binary Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685
4686 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4687 {
4688     flag aSign, bSign, zSign;
4689     int32 aExp, bExp, zExp;
4690     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4691     float128 z;
4692
4693     aSig1 = extractFloat128Frac1( a );
4694     aSig0 = extractFloat128Frac0( a );
4695     aExp = extractFloat128Exp( a );
4696     aSign = extractFloat128Sign( a );
4697     bSig1 = extractFloat128Frac1( b );
4698     bSig0 = extractFloat128Frac0( b );
4699     bExp = extractFloat128Exp( b );
4700     bSign = extractFloat128Sign( b );
4701     zSign = aSign ^ bSign;
4702     if ( aExp == 0x7FFF ) {
4703         if (    ( aSig0 | aSig1 )
4704              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4705             return propagateFloat128NaN( a, b STATUS_VAR );
4706         }
4707         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4708         return packFloat128( zSign, 0x7FFF, 0, 0 );
4709     }
4710     if ( bExp == 0x7FFF ) {
4711         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4712         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4713  invalid:
4714             float_raise( float_flag_invalid STATUS_VAR);
4715             z.low = float128_default_nan_low;
4716             z.high = float128_default_nan_high;
4717             return z;
4718         }
4719         return packFloat128( zSign, 0x7FFF, 0, 0 );
4720     }
4721     if ( aExp == 0 ) {
4722         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4723         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4724     }
4725     if ( bExp == 0 ) {
4726         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4727         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4728     }
4729     zExp = aExp + bExp - 0x4000;
4730     aSig0 |= LIT64( 0x0001000000000000 );
4731     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4732     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4733     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4734     zSig2 |= ( zSig3 != 0 );
4735     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4736         shift128ExtraRightJamming(
4737             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4738         ++zExp;
4739     }
4740     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4741
4742 }
4743
4744 /*----------------------------------------------------------------------------
4745 | Returns the result of dividing the quadruple-precision floating-point value
4746 | `a' by the corresponding value `b'.  The operation is performed according to
4747 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4748 *----------------------------------------------------------------------------*/
4749
4750 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4751 {
4752     flag aSign, bSign, zSign;
4753     int32 aExp, bExp, zExp;
4754     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4756     float128 z;
4757
4758     aSig1 = extractFloat128Frac1( a );
4759     aSig0 = extractFloat128Frac0( a );
4760     aExp = extractFloat128Exp( a );
4761     aSign = extractFloat128Sign( a );
4762     bSig1 = extractFloat128Frac1( b );
4763     bSig0 = extractFloat128Frac0( b );
4764     bExp = extractFloat128Exp( b );
4765     bSign = extractFloat128Sign( b );
4766     zSign = aSign ^ bSign;
4767     if ( aExp == 0x7FFF ) {
4768         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4769         if ( bExp == 0x7FFF ) {
4770             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4771             goto invalid;
4772         }
4773         return packFloat128( zSign, 0x7FFF, 0, 0 );
4774     }
4775     if ( bExp == 0x7FFF ) {
4776         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777         return packFloat128( zSign, 0, 0, 0 );
4778     }
4779     if ( bExp == 0 ) {
4780         if ( ( bSig0 | bSig1 ) == 0 ) {
4781             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4782  invalid:
4783                 float_raise( float_flag_invalid STATUS_VAR);
4784                 z.low = float128_default_nan_low;
4785                 z.high = float128_default_nan_high;
4786                 return z;
4787             }
4788             float_raise( float_flag_divbyzero STATUS_VAR);
4789             return packFloat128( zSign, 0x7FFF, 0, 0 );
4790         }
4791         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4792     }
4793     if ( aExp == 0 ) {
4794         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4795         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4796     }
4797     zExp = aExp - bExp + 0x3FFD;
4798     shortShift128Left(
4799         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4800     shortShift128Left(
4801         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4802     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4803         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4804         ++zExp;
4805     }
4806     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4807     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4808     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4809     while ( (sbits64) rem0 < 0 ) {
4810         --zSig0;
4811         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4812     }
4813     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4814     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4815         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4816         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4817         while ( (sbits64) rem1 < 0 ) {
4818             --zSig1;
4819             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4820         }
4821         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4822     }
4823     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4824     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4825
4826 }
4827
4828 /*----------------------------------------------------------------------------
4829 | Returns the remainder of the quadruple-precision floating-point value `a'
4830 | with respect to the corresponding value `b'.  The operation is performed
4831 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4832 *----------------------------------------------------------------------------*/
4833
4834 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4835 {
4836     flag aSign, bSign, zSign;
4837     int32 aExp, bExp, expDiff;
4838     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4839     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4840     sbits64 sigMean0;
4841     float128 z;
4842
4843     aSig1 = extractFloat128Frac1( a );
4844     aSig0 = extractFloat128Frac0( a );
4845     aExp = extractFloat128Exp( a );
4846     aSign = extractFloat128Sign( a );
4847     bSig1 = extractFloat128Frac1( b );
4848     bSig0 = extractFloat128Frac0( b );
4849     bExp = extractFloat128Exp( b );
4850     bSign = extractFloat128Sign( b );
4851     if ( aExp == 0x7FFF ) {
4852         if (    ( aSig0 | aSig1 )
4853              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4854             return propagateFloat128NaN( a, b STATUS_VAR );
4855         }
4856         goto invalid;
4857     }
4858     if ( bExp == 0x7FFF ) {
4859         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4860         return a;
4861     }
4862     if ( bExp == 0 ) {
4863         if ( ( bSig0 | bSig1 ) == 0 ) {
4864  invalid:
4865             float_raise( float_flag_invalid STATUS_VAR);
4866             z.low = float128_default_nan_low;
4867             z.high = float128_default_nan_high;
4868             return z;
4869         }
4870         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4871     }
4872     if ( aExp == 0 ) {
4873         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4874         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4875     }
4876     expDiff = aExp - bExp;
4877     if ( expDiff < -1 ) return a;
4878     shortShift128Left(
4879         aSig0 | LIT64( 0x0001000000000000 ),
4880         aSig1,
4881         15 - ( expDiff < 0 ),
4882         &aSig0,
4883         &aSig1
4884     );
4885     shortShift128Left(
4886         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4887     q = le128( bSig0, bSig1, aSig0, aSig1 );
4888     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4889     expDiff -= 64;
4890     while ( 0 < expDiff ) {
4891         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4892         q = ( 4 < q ) ? q - 4 : 0;
4893         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4894         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4895         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4896         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4897         expDiff -= 61;
4898     }
4899     if ( -64 < expDiff ) {
4900         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4901         q = ( 4 < q ) ? q - 4 : 0;
4902         q >>= - expDiff;
4903         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904         expDiff += 52;
4905         if ( expDiff < 0 ) {
4906             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4907         }
4908         else {
4909             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4910         }
4911         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4912         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4913     }
4914     else {
4915         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4916         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4917     }
4918     do {
4919         alternateASig0 = aSig0;
4920         alternateASig1 = aSig1;
4921         ++q;
4922         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4923     } while ( 0 <= (sbits64) aSig0 );
4924     add128(
4925         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4926     if (    ( sigMean0 < 0 )
4927          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4928         aSig0 = alternateASig0;
4929         aSig1 = alternateASig1;
4930     }
4931     zSign = ( (sbits64) aSig0 < 0 );
4932     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4933     return
4934         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4935
4936 }
4937
4938 /*----------------------------------------------------------------------------
4939 | Returns the square root of the quadruple-precision floating-point value `a'.
4940 | The operation is performed according to the IEC/IEEE Standard for Binary
4941 | Floating-Point Arithmetic.
4942 *----------------------------------------------------------------------------*/
4943
4944 float128 float128_sqrt( float128 a STATUS_PARAM )
4945 {
4946     flag aSign;
4947     int32 aExp, zExp;
4948     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4949     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4950     float128 z;
4951
4952     aSig1 = extractFloat128Frac1( a );
4953     aSig0 = extractFloat128Frac0( a );
4954     aExp = extractFloat128Exp( a );
4955     aSign = extractFloat128Sign( a );
4956     if ( aExp == 0x7FFF ) {
4957         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4958         if ( ! aSign ) return a;
4959         goto invalid;
4960     }
4961     if ( aSign ) {
4962         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4963  invalid:
4964         float_raise( float_flag_invalid STATUS_VAR);
4965         z.low = float128_default_nan_low;
4966         z.high = float128_default_nan_high;
4967         return z;
4968     }
4969     if ( aExp == 0 ) {
4970         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4971         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4972     }
4973     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4974     aSig0 |= LIT64( 0x0001000000000000 );
4975     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4976     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4977     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4978     doubleZSig0 = zSig0<<1;
4979     mul64To128( zSig0, zSig0, &term0, &term1 );
4980     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4981     while ( (sbits64) rem0 < 0 ) {
4982         --zSig0;
4983         doubleZSig0 -= 2;
4984         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4985     }
4986     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4987     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4988         if ( zSig1 == 0 ) zSig1 = 1;
4989         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4990         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4991         mul64To128( zSig1, zSig1, &term2, &term3 );
4992         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4993         while ( (sbits64) rem1 < 0 ) {
4994             --zSig1;
4995             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4996             term3 |= 1;
4997             term2 |= doubleZSig0;
4998             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4999         }
5000         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5001     }
5002     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5003     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5004
5005 }
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5009 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5010 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011 *----------------------------------------------------------------------------*/
5012
5013 flag float128_eq( float128 a, float128 b STATUS_PARAM )
5014 {
5015
5016     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5017               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5018          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5019               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5020        ) {
5021         if (    float128_is_signaling_nan( a )
5022              || float128_is_signaling_nan( b ) ) {
5023             float_raise( float_flag_invalid STATUS_VAR);
5024         }
5025         return 0;
5026     }
5027     return
5028            ( a.low == b.low )
5029         && (    ( a.high == b.high )
5030              || (    ( a.low == 0 )
5031                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5032            );
5033
5034 }
5035
5036 /*----------------------------------------------------------------------------
5037 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5038 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5040 | Arithmetic.
5041 *----------------------------------------------------------------------------*/
5042
5043 flag float128_le( float128 a, float128 b STATUS_PARAM )
5044 {
5045     flag aSign, bSign;
5046
5047     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5048               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5049          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5050               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5051        ) {
5052         float_raise( float_flag_invalid STATUS_VAR);
5053         return 0;
5054     }
5055     aSign = extractFloat128Sign( a );
5056     bSign = extractFloat128Sign( b );
5057     if ( aSign != bSign ) {
5058         return
5059                aSign
5060             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5061                  == 0 );
5062     }
5063     return
5064           aSign ? le128( b.high, b.low, a.high, a.low )
5065         : le128( a.high, a.low, b.high, b.low );
5066
5067 }
5068
5069 /*----------------------------------------------------------------------------
5070 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5071 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5074
5075 flag float128_lt( float128 a, float128 b STATUS_PARAM )
5076 {
5077     flag aSign, bSign;
5078
5079     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5080               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5081          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5082               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5083        ) {
5084         float_raise( float_flag_invalid STATUS_VAR);
5085         return 0;
5086     }
5087     aSign = extractFloat128Sign( a );
5088     bSign = extractFloat128Sign( b );
5089     if ( aSign != bSign ) {
5090         return
5091                aSign
5092             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5093                  != 0 );
5094     }
5095     return
5096           aSign ? lt128( b.high, b.low, a.high, a.low )
5097         : lt128( a.high, a.low, b.high, b.low );
5098
5099 }
5100
5101 /*----------------------------------------------------------------------------
5102 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5103 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5104 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106 *----------------------------------------------------------------------------*/
5107
5108 flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5109 {
5110
5111     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5112               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5113          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5114               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5115        ) {
5116         float_raise( float_flag_invalid STATUS_VAR);
5117         return 0;
5118     }
5119     return
5120            ( a.low == b.low )
5121         && (    ( a.high == b.high )
5122              || (    ( a.low == 0 )
5123                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5124            );
5125
5126 }
5127
5128 /*----------------------------------------------------------------------------
5129 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5130 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5131 | cause an exception.  Otherwise, the comparison is performed according to the
5132 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5133 *----------------------------------------------------------------------------*/
5134
5135 flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5136 {
5137     flag aSign, bSign;
5138
5139     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5140               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5142               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5143        ) {
5144         if (    float128_is_signaling_nan( a )
5145              || float128_is_signaling_nan( b ) ) {
5146             float_raise( float_flag_invalid STATUS_VAR);
5147         }
5148         return 0;
5149     }
5150     aSign = extractFloat128Sign( a );
5151     bSign = extractFloat128Sign( b );
5152     if ( aSign != bSign ) {
5153         return
5154                aSign
5155             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5156                  == 0 );
5157     }
5158     return
5159           aSign ? le128( b.high, b.low, a.high, a.low )
5160         : le128( a.high, a.low, b.high, b.low );
5161
5162 }
5163
5164 /*----------------------------------------------------------------------------
5165 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5166 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5167 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5168 | Standard for Binary Floating-Point Arithmetic.
5169 *----------------------------------------------------------------------------*/
5170
5171 flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5172 {
5173     flag aSign, bSign;
5174
5175     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5176               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5177          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5178               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5179        ) {
5180         if (    float128_is_signaling_nan( a )
5181              || float128_is_signaling_nan( b ) ) {
5182             float_raise( float_flag_invalid STATUS_VAR);
5183         }
5184         return 0;
5185     }
5186     aSign = extractFloat128Sign( a );
5187     bSign = extractFloat128Sign( b );
5188     if ( aSign != bSign ) {
5189         return
5190                aSign
5191             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5192                  != 0 );
5193     }
5194     return
5195           aSign ? lt128( b.high, b.low, a.high, a.low )
5196         : lt128( a.high, a.low, b.high, b.low );
5197
5198 }
5199
5200 #endif
5201
5202 /* misc functions */
5203 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5204 {
5205     return int64_to_float32(a STATUS_VAR);
5206 }
5207
5208 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5209 {
5210     return int64_to_float64(a STATUS_VAR);
5211 }
5212
5213 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5214 {
5215     int64_t v;
5216     unsigned int res;
5217
5218     v = float32_to_int64(a STATUS_VAR);
5219     if (v < 0) {
5220         res = 0;
5221         float_raise( float_flag_invalid STATUS_VAR);
5222     } else if (v > 0xffffffff) {
5223         res = 0xffffffff;
5224         float_raise( float_flag_invalid STATUS_VAR);
5225     } else {
5226         res = v;
5227     }
5228     return res;
5229 }
5230
5231 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5232 {
5233     int64_t v;
5234     unsigned int res;
5235
5236     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5237     if (v < 0) {
5238         res = 0;
5239         float_raise( float_flag_invalid STATUS_VAR);
5240     } else if (v > 0xffffffff) {
5241         res = 0xffffffff;
5242         float_raise( float_flag_invalid STATUS_VAR);
5243     } else {
5244         res = v;
5245     }
5246     return res;
5247 }
5248
5249 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5250 {
5251     int64_t v;
5252     unsigned int res;
5253
5254     v = float64_to_int64(a STATUS_VAR);
5255     if (v < 0) {
5256         res = 0;
5257         float_raise( float_flag_invalid STATUS_VAR);
5258     } else if (v > 0xffffffff) {
5259         res = 0xffffffff;
5260         float_raise( float_flag_invalid STATUS_VAR);
5261     } else {
5262         res = v;
5263     }
5264     return res;
5265 }
5266
5267 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5268 {
5269     int64_t v;
5270     unsigned int res;
5271
5272     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5273     if (v < 0) {
5274         res = 0;
5275         float_raise( float_flag_invalid STATUS_VAR);
5276     } else if (v > 0xffffffff) {
5277         res = 0xffffffff;
5278         float_raise( float_flag_invalid STATUS_VAR);
5279     } else {
5280         res = v;
5281     }
5282     return res;
5283 }
5284
5285 #define COMPARE(s, nan_exp)                                                  \
5286 INLINE char float ## s ## _compare_internal( float ## s a, float ## s b,     \
5287                                       int is_quiet STATUS_PARAM )            \
5288 {                                                                            \
5289     flag aSign, bSign;                                                       \
5290                                                                              \
5291     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5292          extractFloat ## s ## Frac( a ) ) ||                                 \
5293         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5294           extractFloat ## s ## Frac( b ) )) {                                \
5295         if (!is_quiet ||                                                     \
5296             float ## s ## _is_signaling_nan( a ) ||                          \
5297             float ## s ## _is_signaling_nan( b ) ) {                         \
5298             float_raise( float_flag_invalid STATUS_VAR);                     \
5299         }                                                                    \
5300         return float_relation_unordered;                                     \
5301     }                                                                        \
5302     aSign = extractFloat ## s ## Sign( a );                                  \
5303     bSign = extractFloat ## s ## Sign( b );                                  \
5304     if ( aSign != bSign ) {                                                  \
5305         if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5306             /* zero case */                                                  \
5307             return float_relation_equal;                                     \
5308         } else {                                                             \
5309             return 1 - (2 * aSign);                                          \
5310         }                                                                    \
5311     } else {                                                                 \
5312         if (a == b) {                                                        \
5313             return float_relation_equal;                                     \
5314         } else {                                                             \
5315             return 1 - 2 * (aSign ^ ( a < b ));                              \
5316         }                                                                    \
5317     }                                                                        \
5318 }                                                                            \
5319                                                                              \
5320 char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )       \
5321 {                                                                            \
5322     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5323 }                                                                            \
5324                                                                              \
5325 char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5326 {                                                                            \
5327     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5328 }
5329
5330 COMPARE(32, 0xff)
5331 COMPARE(64, 0x7ff)