git.maemo.org Git - qemu/blob - fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 #ifdef FLOATX80
  58 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  59 {
  60     STATUS(floatx80_rounding_precision) = val;
  61 }
  62 #endif
  63
  64 /*----------------------------------------------------------------------------
  65 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  66 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  67 | input.  If `zSign' is 1, the input is negated before being converted to an
  68 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  69 | is simply rounded to an integer, with the inexact exception raised if the
  70 | input cannot be represented exactly as an integer.  However, if the fixed-
  71 | point input is too large, the invalid exception is raised and the largest
  72 | positive or negative integer is returned.
  73 *----------------------------------------------------------------------------*/
  74
  75 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  76 {
  77     int8 roundingMode;
  78     flag roundNearestEven;
  79     int8 roundIncrement, roundBits;
  80     int32 z;
  81
  82     roundingMode = STATUS(float_rounding_mode);
  83     roundNearestEven = ( roundingMode == float_round_nearest_even );
  84     roundIncrement = 0x40;
  85     if ( ! roundNearestEven ) {
  86         if ( roundingMode == float_round_to_zero ) {
  87             roundIncrement = 0;
  88         }
  89         else {
  90             roundIncrement = 0x7F;
  91             if ( zSign ) {
  92                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  93             }
  94             else {
  95                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  96             }
  97         }
  98     }
  99     roundBits = absZ & 0x7F;
 100     absZ = ( absZ + roundIncrement )>>7;
 101     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 102     z = absZ;
 103     if ( zSign ) z = - z;
 104     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 105         float_raise( float_flag_invalid STATUS_VAR);
 106         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 107     }
 108     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 109     return z;
 110
 111 }
 112
 113 /*----------------------------------------------------------------------------
 114 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 115 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 116 | and returns the properly rounded 64-bit integer corresponding to the input.
 117 | If `zSign' is 1, the input is negated before being converted to an integer.
 118 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 119 | the inexact exception raised if the input cannot be represented exactly as
 120 | an integer.  However, if the fixed-point input is too large, the invalid
 121 | exception is raised and the largest positive or negative integer is
 122 | returned.
 123 *----------------------------------------------------------------------------*/
 124
 125 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 126 {
 127     int8 roundingMode;
 128     flag roundNearestEven, increment;
 129     int64 z;
 130
 131     roundingMode = STATUS(float_rounding_mode);
 132     roundNearestEven = ( roundingMode == float_round_nearest_even );
 133     increment = ( (sbits64) absZ1 < 0 );
 134     if ( ! roundNearestEven ) {
 135         if ( roundingMode == float_round_to_zero ) {
 136             increment = 0;
 137         }
 138         else {
 139             if ( zSign ) {
 140                 increment = ( roundingMode == float_round_down ) && absZ1;
 141             }
 142             else {
 143                 increment = ( roundingMode == float_round_up ) && absZ1;
 144             }
 145         }
 146     }
 147     if ( increment ) {
 148         ++absZ0;
 149         if ( absZ0 == 0 ) goto overflow;
 150         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 151     }
 152     z = absZ0;
 153     if ( zSign ) z = - z;
 154     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 155  overflow:
 156         float_raise( float_flag_invalid STATUS_VAR);
 157         return
 158               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 159             : LIT64( 0x7FFFFFFFFFFFFFFF );
 160     }
 161     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 162     return z;
 163
 164 }
 165
 166 /*----------------------------------------------------------------------------
 167 | Returns the fraction bits of the single-precision floating-point value `a'.
 168 *----------------------------------------------------------------------------*/
 169
 170 INLINE bits32 extractFloat32Frac( float32 a )
 171 {
 172
 173     return a & 0x007FFFFF;
 174
 175 }
 176
 177 /*----------------------------------------------------------------------------
 178 | Returns the exponent bits of the single-precision floating-point value `a'.
 179 *----------------------------------------------------------------------------*/
 180
 181 INLINE int16 extractFloat32Exp( float32 a )
 182 {
 183
 184     return ( a>>23 ) & 0xFF;
 185
 186 }
 187
 188 /*----------------------------------------------------------------------------
 189 | Returns the sign bit of the single-precision floating-point value `a'.
 190 *----------------------------------------------------------------------------*/
 191
 192 INLINE flag extractFloat32Sign( float32 a )
 193 {
 194
 195     return a>>31;
 196
 197 }
 198
 199 /*----------------------------------------------------------------------------
 200 | Normalizes the subnormal single-precision floating-point value represented
 201 | by the denormalized significand `aSig'.  The normalized exponent and
 202 | significand are stored at the locations pointed to by `zExpPtr' and
 203 | `zSigPtr', respectively.
 204 *----------------------------------------------------------------------------*/
 205
 206 static void
 207  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 208 {
 209     int8 shiftCount;
 210
 211     shiftCount = countLeadingZeros32( aSig ) - 8;
 212     *zSigPtr = aSig<<shiftCount;
 213     *zExpPtr = 1 - shiftCount;
 214
 215 }
 216
 217 /*----------------------------------------------------------------------------
 218 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 219 | single-precision floating-point value, returning the result.  After being
 220 | shifted into the proper positions, the three fields are simply added
 221 | together to form the result.  This means that any integer portion of `zSig'
 222 | will be added into the exponent.  Since a properly normalized significand
 223 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 224 | than the desired result exponent whenever `zSig' is a complete, normalized
 225 | significand.
 226 *----------------------------------------------------------------------------*/
 227
 228 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 229 {
 230
 231     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 232
 233 }
 234
 235 /*----------------------------------------------------------------------------
 236 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 237 | and significand `zSig', and returns the proper single-precision floating-
 238 | point value corresponding to the abstract input.  Ordinarily, the abstract
 239 | value is simply rounded and packed into the single-precision format, with
 240 | the inexact exception raised if the abstract input cannot be represented
 241 | exactly.  However, if the abstract value is too large, the overflow and
 242 | inexact exceptions are raised and an infinity or maximal finite value is
 243 | returned.  If the abstract value is too small, the input value is rounded to
 244 | a subnormal number, and the underflow and inexact exceptions are raised if
 245 | the abstract input cannot be represented exactly as a subnormal single-
 246 | precision floating-point number.
 247 |     The input significand `zSig' has its binary point between bits 30
 248 | and 29, which is 7 bits to the left of the usual location.  This shifted
 249 | significand must be normalized or smaller.  If `zSig' is not normalized,
 250 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 251 | and it must not require rounding.  In the usual case that `zSig' is
 252 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 253 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 254 | Binary Floating-Point Arithmetic.
 255 *----------------------------------------------------------------------------*/
 256
 257 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 258 {
 259     int8 roundingMode;
 260     flag roundNearestEven;
 261     int8 roundIncrement, roundBits;
 262     flag isTiny;
 263
 264     roundingMode = STATUS(float_rounding_mode);
 265     roundNearestEven = ( roundingMode == float_round_nearest_even );
 266     roundIncrement = 0x40;
 267     if ( ! roundNearestEven ) {
 268         if ( roundingMode == float_round_to_zero ) {
 269             roundIncrement = 0;
 270         }
 271         else {
 272             roundIncrement = 0x7F;
 273             if ( zSign ) {
 274                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 275             }
 276             else {
 277                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 278             }
 279         }
 280     }
 281     roundBits = zSig & 0x7F;
 282     if ( 0xFD <= (bits16) zExp ) {
 283         if (    ( 0xFD < zExp )
 284              || (    ( zExp == 0xFD )
 285                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 286            ) {
 287             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 288             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 289         }
 290         if ( zExp < 0 ) {
 291             isTiny =
 292                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 293                 || ( zExp < -1 )
 294                 || ( zSig + roundIncrement < 0x80000000 );
 295             shift32RightJamming( zSig, - zExp, &zSig );
 296             zExp = 0;
 297             roundBits = zSig & 0x7F;
 298             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 299         }
 300     }
 301     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 302     zSig = ( zSig + roundIncrement )>>7;
 303     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 304     if ( zSig == 0 ) zExp = 0;
 305     return packFloat32( zSign, zExp, zSig );
 306
 307 }
 308
 309 /*----------------------------------------------------------------------------
 310 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 311 | and significand `zSig', and returns the proper single-precision floating-
 312 | point value corresponding to the abstract input.  This routine is just like
 313 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 314 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 315 | floating-point exponent.
 316 *----------------------------------------------------------------------------*/
 317
 318 static float32
 319  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 320 {
 321     int8 shiftCount;
 322
 323     shiftCount = countLeadingZeros32( zSig ) - 1;
 324     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 325
 326 }
 327
 328 /*----------------------------------------------------------------------------
 329 | Returns the fraction bits of the double-precision floating-point value `a'.
 330 *----------------------------------------------------------------------------*/
 331
 332 INLINE bits64 extractFloat64Frac( float64 a )
 333 {
 334
 335     return a & LIT64( 0x000FFFFFFFFFFFFF );
 336
 337 }
 338
 339 /*----------------------------------------------------------------------------
 340 | Returns the exponent bits of the double-precision floating-point value `a'.
 341 *----------------------------------------------------------------------------*/
 342
 343 INLINE int16 extractFloat64Exp( float64 a )
 344 {
 345
 346     return ( a>>52 ) & 0x7FF;
 347
 348 }
 349
 350 /*----------------------------------------------------------------------------
 351 | Returns the sign bit of the double-precision floating-point value `a'.
 352 *----------------------------------------------------------------------------*/
 353
 354 INLINE flag extractFloat64Sign( float64 a )
 355 {
 356
 357     return a>>63;
 358
 359 }
 360
 361 /*----------------------------------------------------------------------------
 362 | Normalizes the subnormal double-precision floating-point value represented
 363 | by the denormalized significand `aSig'.  The normalized exponent and
 364 | significand are stored at the locations pointed to by `zExpPtr' and
 365 | `zSigPtr', respectively.
 366 *----------------------------------------------------------------------------*/
 367
 368 static void
 369  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 370 {
 371     int8 shiftCount;
 372
 373     shiftCount = countLeadingZeros64( aSig ) - 11;
 374     *zSigPtr = aSig<<shiftCount;
 375     *zExpPtr = 1 - shiftCount;
 376
 377 }
 378
 379 /*----------------------------------------------------------------------------
 380 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 381 | double-precision floating-point value, returning the result.  After being
 382 | shifted into the proper positions, the three fields are simply added
 383 | together to form the result.  This means that any integer portion of `zSig'
 384 | will be added into the exponent.  Since a properly normalized significand
 385 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 386 | than the desired result exponent whenever `zSig' is a complete, normalized
 387 | significand.
 388 *----------------------------------------------------------------------------*/
 389
 390 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 391 {
 392
 393     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 394
 395 }
 396
 397 /*----------------------------------------------------------------------------
 398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 399 | and significand `zSig', and returns the proper double-precision floating-
 400 | point value corresponding to the abstract input.  Ordinarily, the abstract
 401 | value is simply rounded and packed into the double-precision format, with
 402 | the inexact exception raised if the abstract input cannot be represented
 403 | exactly.  However, if the abstract value is too large, the overflow and
 404 | inexact exceptions are raised and an infinity or maximal finite value is
 405 | returned.  If the abstract value is too small, the input value is rounded
 406 | to a subnormal number, and the underflow and inexact exceptions are raised
 407 | if the abstract input cannot be represented exactly as a subnormal double-
 408 | precision floating-point number.
 409 |     The input significand `zSig' has its binary point between bits 62
 410 | and 61, which is 10 bits to the left of the usual location.  This shifted
 411 | significand must be normalized or smaller.  If `zSig' is not normalized,
 412 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 413 | and it must not require rounding.  In the usual case that `zSig' is
 414 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 415 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 416 | Binary Floating-Point Arithmetic.
 417 *----------------------------------------------------------------------------*/
 418
 419 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 420 {
 421     int8 roundingMode;
 422     flag roundNearestEven;
 423     int16 roundIncrement, roundBits;
 424     flag isTiny;
 425
 426     roundingMode = STATUS(float_rounding_mode);
 427     roundNearestEven = ( roundingMode == float_round_nearest_even );
 428     roundIncrement = 0x200;
 429     if ( ! roundNearestEven ) {
 430         if ( roundingMode == float_round_to_zero ) {
 431             roundIncrement = 0;
 432         }
 433         else {
 434             roundIncrement = 0x3FF;
 435             if ( zSign ) {
 436                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 437             }
 438             else {
 439                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 440             }
 441         }
 442     }
 443     roundBits = zSig & 0x3FF;
 444     if ( 0x7FD <= (bits16) zExp ) {
 445         if (    ( 0x7FD < zExp )
 446              || (    ( zExp == 0x7FD )
 447                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 448            ) {
 449             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 450             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 451         }
 452         if ( zExp < 0 ) {
 453             isTiny =
 454                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 455                 || ( zExp < -1 )
 456                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 457             shift64RightJamming( zSig, - zExp, &zSig );
 458             zExp = 0;
 459             roundBits = zSig & 0x3FF;
 460             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 461         }
 462     }
 463     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 464     zSig = ( zSig + roundIncrement )>>10;
 465     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 466     if ( zSig == 0 ) zExp = 0;
 467     return packFloat64( zSign, zExp, zSig );
 468
 469 }
 470
 471 /*----------------------------------------------------------------------------
 472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 473 | and significand `zSig', and returns the proper double-precision floating-
 474 | point value corresponding to the abstract input.  This routine is just like
 475 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 476 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 477 | floating-point exponent.
 478 *----------------------------------------------------------------------------*/
 479
 480 static float64
 481  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 482 {
 483     int8 shiftCount;
 484
 485     shiftCount = countLeadingZeros64( zSig ) - 1;
 486     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 487
 488 }
 489
 490 #ifdef FLOATX80
 491
 492 /*----------------------------------------------------------------------------
 493 | Returns the fraction bits of the extended double-precision floating-point
 494 | value `a'.
 495 *----------------------------------------------------------------------------*/
 496
 497 INLINE bits64 extractFloatx80Frac( floatx80 a )
 498 {
 499
 500     return a.low;
 501
 502 }
 503
 504 /*----------------------------------------------------------------------------
 505 | Returns the exponent bits of the extended double-precision floating-point
 506 | value `a'.
 507 *----------------------------------------------------------------------------*/
 508
 509 INLINE int32 extractFloatx80Exp( floatx80 a )
 510 {
 511
 512     return a.high & 0x7FFF;
 513
 514 }
 515
 516 /*----------------------------------------------------------------------------
 517 | Returns the sign bit of the extended double-precision floating-point value
 518 | `a'.
 519 *----------------------------------------------------------------------------*/
 520
 521 INLINE flag extractFloatx80Sign( floatx80 a )
 522 {
 523
 524     return a.high>>15;
 525
 526 }
 527
 528 /*----------------------------------------------------------------------------
 529 | Normalizes the subnormal extended double-precision floating-point value
 530 | represented by the denormalized significand `aSig'.  The normalized exponent
 531 | and significand are stored at the locations pointed to by `zExpPtr' and
 532 | `zSigPtr', respectively.
 533 *----------------------------------------------------------------------------*/
 534
 535 static void
 536  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 537 {
 538     int8 shiftCount;
 539
 540     shiftCount = countLeadingZeros64( aSig );
 541     *zSigPtr = aSig<<shiftCount;
 542     *zExpPtr = 1 - shiftCount;
 543
 544 }
 545
 546 /*----------------------------------------------------------------------------
 547 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 548 | extended double-precision floating-point value, returning the result.
 549 *----------------------------------------------------------------------------*/
 550
 551 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 552 {
 553     floatx80 z;
 554
 555     z.low = zSig;
 556     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 557     return z;
 558
 559 }
 560
 561 /*----------------------------------------------------------------------------
 562 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 563 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 564 | and returns the proper extended double-precision floating-point value
 565 | corresponding to the abstract input.  Ordinarily, the abstract value is
 566 | rounded and packed into the extended double-precision format, with the
 567 | inexact exception raised if the abstract input cannot be represented
 568 | exactly.  However, if the abstract value is too large, the overflow and
 569 | inexact exceptions are raised and an infinity or maximal finite value is
 570 | returned.  If the abstract value is too small, the input value is rounded to
 571 | a subnormal number, and the underflow and inexact exceptions are raised if
 572 | the abstract input cannot be represented exactly as a subnormal extended
 573 | double-precision floating-point number.
 574 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 575 | number of bits as single or double precision, respectively.  Otherwise, the
 576 | result is rounded to the full precision of the extended double-precision
 577 | format.
 578 |     The input significand must be normalized or smaller.  If the input
 579 | significand is not normalized, `zExp' must be 0; in that case, the result
 580 | returned is a subnormal number, and it must not require rounding.  The
 581 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 582 | Floating-Point Arithmetic.
 583 *----------------------------------------------------------------------------*/
 584
 585 static floatx80
 586  roundAndPackFloatx80(
 587      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 588  STATUS_PARAM)
 589 {
 590     int8 roundingMode;
 591     flag roundNearestEven, increment, isTiny;
 592     int64 roundIncrement, roundMask, roundBits;
 593
 594     roundingMode = STATUS(float_rounding_mode);
 595     roundNearestEven = ( roundingMode == float_round_nearest_even );
 596     if ( roundingPrecision == 80 ) goto precision80;
 597     if ( roundingPrecision == 64 ) {
 598         roundIncrement = LIT64( 0x0000000000000400 );
 599         roundMask = LIT64( 0x00000000000007FF );
 600     }
 601     else if ( roundingPrecision == 32 ) {
 602         roundIncrement = LIT64( 0x0000008000000000 );
 603         roundMask = LIT64( 0x000000FFFFFFFFFF );
 604     }
 605     else {
 606         goto precision80;
 607     }
 608     zSig0 |= ( zSig1 != 0 );
 609     if ( ! roundNearestEven ) {
 610         if ( roundingMode == float_round_to_zero ) {
 611             roundIncrement = 0;
 612         }
 613         else {
 614             roundIncrement = roundMask;
 615             if ( zSign ) {
 616                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 617             }
 618             else {
 619                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 620             }
 621         }
 622     }
 623     roundBits = zSig0 & roundMask;
 624     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 625         if (    ( 0x7FFE < zExp )
 626              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 627            ) {
 628             goto overflow;
 629         }
 630         if ( zExp <= 0 ) {
 631             isTiny =
 632                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 633                 || ( zExp < 0 )
 634                 || ( zSig0 <= zSig0 + roundIncrement );
 635             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 636             zExp = 0;
 637             roundBits = zSig0 & roundMask;
 638             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 639             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 640             zSig0 += roundIncrement;
 641             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 642             roundIncrement = roundMask + 1;
 643             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 644                 roundMask |= roundIncrement;
 645             }
 646             zSig0 &= ~ roundMask;
 647             return packFloatx80( zSign, zExp, zSig0 );
 648         }
 649     }
 650     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 651     zSig0 += roundIncrement;
 652     if ( zSig0 < roundIncrement ) {
 653         ++zExp;
 654         zSig0 = LIT64( 0x8000000000000000 );
 655     }
 656     roundIncrement = roundMask + 1;
 657     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 658         roundMask |= roundIncrement;
 659     }
 660     zSig0 &= ~ roundMask;
 661     if ( zSig0 == 0 ) zExp = 0;
 662     return packFloatx80( zSign, zExp, zSig0 );
 663  precision80:
 664     increment = ( (sbits64) zSig1 < 0 );
 665     if ( ! roundNearestEven ) {
 666         if ( roundingMode == float_round_to_zero ) {
 667             increment = 0;
 668         }
 669         else {
 670             if ( zSign ) {
 671                 increment = ( roundingMode == float_round_down ) && zSig1;
 672             }
 673             else {
 674                 increment = ( roundingMode == float_round_up ) && zSig1;
 675             }
 676         }
 677     }
 678     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 679         if (    ( 0x7FFE < zExp )
 680              || (    ( zExp == 0x7FFE )
 681                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 682                   && increment
 683                 )
 684            ) {
 685             roundMask = 0;
 686  overflow:
 687             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 688             if (    ( roundingMode == float_round_to_zero )
 689                  || ( zSign && ( roundingMode == float_round_up ) )
 690                  || ( ! zSign && ( roundingMode == float_round_down ) )
 691                ) {
 692                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 693             }
 694             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 695         }
 696         if ( zExp <= 0 ) {
 697             isTiny =
 698                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 699                 || ( zExp < 0 )
 700                 || ! increment
 701                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 702             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 703             zExp = 0;
 704             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 705             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 706             if ( roundNearestEven ) {
 707                 increment = ( (sbits64) zSig1 < 0 );
 708             }
 709             else {
 710                 if ( zSign ) {
 711                     increment = ( roundingMode == float_round_down ) && zSig1;
 712                 }
 713                 else {
 714                     increment = ( roundingMode == float_round_up ) && zSig1;
 715                 }
 716             }
 717             if ( increment ) {
 718                 ++zSig0;
 719                 zSig0 &=
 720                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 721                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 722             }
 723             return packFloatx80( zSign, zExp, zSig0 );
 724         }
 725     }
 726     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 727     if ( increment ) {
 728         ++zSig0;
 729         if ( zSig0 == 0 ) {
 730             ++zExp;
 731             zSig0 = LIT64( 0x8000000000000000 );
 732         }
 733         else {
 734             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 735         }
 736     }
 737     else {
 738         if ( zSig0 == 0 ) zExp = 0;
 739     }
 740     return packFloatx80( zSign, zExp, zSig0 );
 741
 742 }
 743
 744 /*----------------------------------------------------------------------------
 745 | Takes an abstract floating-point value having sign `zSign', exponent
 746 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 747 | and returns the proper extended double-precision floating-point value
 748 | corresponding to the abstract input.  This routine is just like
 749 | `roundAndPackFloatx80' except that the input significand does not have to be
 750 | normalized.
 751 *----------------------------------------------------------------------------*/
 752
 753 static floatx80
 754  normalizeRoundAndPackFloatx80(
 755      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 756  STATUS_PARAM)
 757 {
 758     int8 shiftCount;
 759
 760     if ( zSig0 == 0 ) {
 761         zSig0 = zSig1;
 762         zSig1 = 0;
 763         zExp -= 64;
 764     }
 765     shiftCount = countLeadingZeros64( zSig0 );
 766     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 767     zExp -= shiftCount;
 768     return
 769         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 770
 771 }
 772
 773 #endif
 774
 775 #ifdef FLOAT128
 776
 777 /*----------------------------------------------------------------------------
 778 | Returns the least-significant 64 fraction bits of the quadruple-precision
 779 | floating-point value `a'.
 780 *----------------------------------------------------------------------------*/
 781
 782 INLINE bits64 extractFloat128Frac1( float128 a )
 783 {
 784
 785     return a.low;
 786
 787 }
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns the most-significant 48 fraction bits of the quadruple-precision
 791 | floating-point value `a'.
 792 *----------------------------------------------------------------------------*/
 793
 794 INLINE bits64 extractFloat128Frac0( float128 a )
 795 {
 796
 797     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 798
 799 }
 800
 801 /*----------------------------------------------------------------------------
 802 | Returns the exponent bits of the quadruple-precision floating-point value
 803 | `a'.
 804 *----------------------------------------------------------------------------*/
 805
 806 INLINE int32 extractFloat128Exp( float128 a )
 807 {
 808
 809     return ( a.high>>48 ) & 0x7FFF;
 810
 811 }
 812
 813 /*----------------------------------------------------------------------------
 814 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 815 *----------------------------------------------------------------------------*/
 816
 817 INLINE flag extractFloat128Sign( float128 a )
 818 {
 819
 820     return a.high>>63;
 821
 822 }
 823
 824 /*----------------------------------------------------------------------------
 825 | Normalizes the subnormal quadruple-precision floating-point value
 826 | represented by the denormalized significand formed by the concatenation of
 827 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 828 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 829 | significand are stored at the location pointed to by `zSig0Ptr', and the
 830 | least significant 64 bits of the normalized significand are stored at the
 831 | location pointed to by `zSig1Ptr'.
 832 *----------------------------------------------------------------------------*/
 833
 834 static void
 835  normalizeFloat128Subnormal(
 836      bits64 aSig0,
 837      bits64 aSig1,
 838      int32 *zExpPtr,
 839      bits64 *zSig0Ptr,
 840      bits64 *zSig1Ptr
 841  )
 842 {
 843     int8 shiftCount;
 844
 845     if ( aSig0 == 0 ) {
 846         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 847         if ( shiftCount < 0 ) {
 848             *zSig0Ptr = aSig1>>( - shiftCount );
 849             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 850         }
 851         else {
 852             *zSig0Ptr = aSig1<<shiftCount;
 853             *zSig1Ptr = 0;
 854         }
 855         *zExpPtr = - shiftCount - 63;
 856     }
 857     else {
 858         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 859         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 860         *zExpPtr = 1 - shiftCount;
 861     }
 862
 863 }
 864
 865 /*----------------------------------------------------------------------------
 866 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 867 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 868 | floating-point value, returning the result.  After being shifted into the
 869 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 870 | added together to form the most significant 32 bits of the result.  This
 871 | means that any integer portion of `zSig0' will be added into the exponent.
 872 | Since a properly normalized significand will have an integer portion equal
 873 | to 1, the `zExp' input should be 1 less than the desired result exponent
 874 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 875 | significand.
 876 *----------------------------------------------------------------------------*/
 877
 878 INLINE float128
 879  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 880 {
 881     float128 z;
 882
 883     z.low = zSig1;
 884     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 885     return z;
 886
 887 }
 888
 889 /*----------------------------------------------------------------------------
 890 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 891 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 892 | and `zSig2', and returns the proper quadruple-precision floating-point value
 893 | corresponding to the abstract input.  Ordinarily, the abstract value is
 894 | simply rounded and packed into the quadruple-precision format, with the
 895 | inexact exception raised if the abstract input cannot be represented
 896 | exactly.  However, if the abstract value is too large, the overflow and
 897 | inexact exceptions are raised and an infinity or maximal finite value is
 898 | returned.  If the abstract value is too small, the input value is rounded to
 899 | a subnormal number, and the underflow and inexact exceptions are raised if
 900 | the abstract input cannot be represented exactly as a subnormal quadruple-
 901 | precision floating-point number.
 902 |     The input significand must be normalized or smaller.  If the input
 903 | significand is not normalized, `zExp' must be 0; in that case, the result
 904 | returned is a subnormal number, and it must not require rounding.  In the
 905 | usual case that the input significand is normalized, `zExp' must be 1 less
 906 | than the ``true'' floating-point exponent.  The handling of underflow and
 907 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 908 *----------------------------------------------------------------------------*/
 909
 910 static float128
 911  roundAndPackFloat128(
 912      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 913 {
 914     int8 roundingMode;
 915     flag roundNearestEven, increment, isTiny;
 916
 917     roundingMode = STATUS(float_rounding_mode);
 918     roundNearestEven = ( roundingMode == float_round_nearest_even );
 919     increment = ( (sbits64) zSig2 < 0 );
 920     if ( ! roundNearestEven ) {
 921         if ( roundingMode == float_round_to_zero ) {
 922             increment = 0;
 923         }
 924         else {
 925             if ( zSign ) {
 926                 increment = ( roundingMode == float_round_down ) && zSig2;
 927             }
 928             else {
 929                 increment = ( roundingMode == float_round_up ) && zSig2;
 930             }
 931         }
 932     }
 933     if ( 0x7FFD <= (bits32) zExp ) {
 934         if (    ( 0x7FFD < zExp )
 935              || (    ( zExp == 0x7FFD )
 936                   && eq128(
 937                          LIT64( 0x0001FFFFFFFFFFFF ),
 938                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 939                          zSig0,
 940                          zSig1
 941                      )
 942                   && increment
 943                 )
 944            ) {
 945             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 946             if (    ( roundingMode == float_round_to_zero )
 947                  || ( zSign && ( roundingMode == float_round_up ) )
 948                  || ( ! zSign && ( roundingMode == float_round_down ) )
 949                ) {
 950                 return
 951                     packFloat128(
 952                         zSign,
 953                         0x7FFE,
 954                         LIT64( 0x0000FFFFFFFFFFFF ),
 955                         LIT64( 0xFFFFFFFFFFFFFFFF )
 956                     );
 957             }
 958             return packFloat128( zSign, 0x7FFF, 0, 0 );
 959         }
 960         if ( zExp < 0 ) {
 961             isTiny =
 962                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 963                 || ( zExp < -1 )
 964                 || ! increment
 965                 || lt128(
 966                        zSig0,
 967                        zSig1,
 968                        LIT64( 0x0001FFFFFFFFFFFF ),
 969                        LIT64( 0xFFFFFFFFFFFFFFFF )
 970                    );
 971             shift128ExtraRightJamming(
 972                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 973             zExp = 0;
 974             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 975             if ( roundNearestEven ) {
 976                 increment = ( (sbits64) zSig2 < 0 );
 977             }
 978             else {
 979                 if ( zSign ) {
 980                     increment = ( roundingMode == float_round_down ) && zSig2;
 981                 }
 982                 else {
 983                     increment = ( roundingMode == float_round_up ) && zSig2;
 984                 }
 985             }
 986         }
 987     }
 988     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
 989     if ( increment ) {
 990         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 991         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 992     }
 993     else {
 994         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 995     }
 996     return packFloat128( zSign, zExp, zSig0, zSig1 );
 997
 998 }
 999
1000 /*----------------------------------------------------------------------------
1001 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1002 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1003 | returns the proper quadruple-precision floating-point value corresponding
1004 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1005 | except that the input significand has fewer bits and does not have to be
1006 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1007 | point exponent.
1008 *----------------------------------------------------------------------------*/
1009
1010 static float128
1011  normalizeRoundAndPackFloat128(
1012      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1013 {
1014     int8 shiftCount;
1015     bits64 zSig2;
1016
1017     if ( zSig0 == 0 ) {
1018         zSig0 = zSig1;
1019         zSig1 = 0;
1020         zExp -= 64;
1021     }
1022     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1023     if ( 0 <= shiftCount ) {
1024         zSig2 = 0;
1025         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1026     }
1027     else {
1028         shift128ExtraRightJamming(
1029             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1030     }
1031     zExp -= shiftCount;
1032     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1033
1034 }
1035
1036 #endif
1037
1038 /*----------------------------------------------------------------------------
1039 | Returns the result of converting the 32-bit two's complement integer `a'
1040 | to the single-precision floating-point format.  The conversion is performed
1041 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1042 *----------------------------------------------------------------------------*/
1043
1044 float32 int32_to_float32( int32 a STATUS_PARAM )
1045 {
1046     flag zSign;
1047
1048     if ( a == 0 ) return 0;
1049     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1050     zSign = ( a < 0 );
1051     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1052
1053 }
1054
1055 /*----------------------------------------------------------------------------
1056 | Returns the result of converting the 32-bit two's complement integer `a'
1057 | to the double-precision floating-point format.  The conversion is performed
1058 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1059 *----------------------------------------------------------------------------*/
1060
1061 float64 int32_to_float64( int32 a STATUS_PARAM )
1062 {
1063     flag zSign;
1064     uint32 absA;
1065     int8 shiftCount;
1066     bits64 zSig;
1067
1068     if ( a == 0 ) return 0;
1069     zSign = ( a < 0 );
1070     absA = zSign ? - a : a;
1071     shiftCount = countLeadingZeros32( absA ) + 21;
1072     zSig = absA;
1073     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1074
1075 }
1076
1077 #ifdef FLOATX80
1078
1079 /*----------------------------------------------------------------------------
1080 | Returns the result of converting the 32-bit two's complement integer `a'
1081 | to the extended double-precision floating-point format.  The conversion
1082 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1083 | Arithmetic.
1084 *----------------------------------------------------------------------------*/
1085
1086 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1087 {
1088     flag zSign;
1089     uint32 absA;
1090     int8 shiftCount;
1091     bits64 zSig;
1092
1093     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1094     zSign = ( a < 0 );
1095     absA = zSign ? - a : a;
1096     shiftCount = countLeadingZeros32( absA ) + 32;
1097     zSig = absA;
1098     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1099
1100 }
1101
1102 #endif
1103
1104 #ifdef FLOAT128
1105
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a' to
1108 | the quadruple-precision floating-point format.  The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1111
1112 float128 int32_to_float128( int32 a STATUS_PARAM )
1113 {
1114     flag zSign;
1115     uint32 absA;
1116     int8 shiftCount;
1117     bits64 zSig0;
1118
1119     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1120     zSign = ( a < 0 );
1121     absA = zSign ? - a : a;
1122     shiftCount = countLeadingZeros32( absA ) + 17;
1123     zSig0 = absA;
1124     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1125
1126 }
1127
1128 #endif
1129
1130 /*----------------------------------------------------------------------------
1131 | Returns the result of converting the 64-bit two's complement integer `a'
1132 | to the single-precision floating-point format.  The conversion is performed
1133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 *----------------------------------------------------------------------------*/
1135
1136 float32 int64_to_float32( int64 a STATUS_PARAM )
1137 {
1138     flag zSign;
1139     uint64 absA;
1140     int8 shiftCount;
1141
1142     if ( a == 0 ) return 0;
1143     zSign = ( a < 0 );
1144     absA = zSign ? - a : a;
1145     shiftCount = countLeadingZeros64( absA ) - 40;
1146     if ( 0 <= shiftCount ) {
1147         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1148     }
1149     else {
1150         shiftCount += 7;
1151         if ( shiftCount < 0 ) {
1152             shift64RightJamming( absA, - shiftCount, &absA );
1153         }
1154         else {
1155             absA <<= shiftCount;
1156         }
1157         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1158     }
1159
1160 }
1161
1162 /*----------------------------------------------------------------------------
1163 | Returns the result of converting the 64-bit two's complement integer `a'
1164 | to the double-precision floating-point format.  The conversion is performed
1165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166 *----------------------------------------------------------------------------*/
1167
1168 float64 int64_to_float64( int64 a STATUS_PARAM )
1169 {
1170     flag zSign;
1171
1172     if ( a == 0 ) return 0;
1173     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1174         return packFloat64( 1, 0x43E, 0 );
1175     }
1176     zSign = ( a < 0 );
1177     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1178
1179 }
1180
1181 #ifdef FLOATX80
1182
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 64-bit two's complement integer `a'
1185 | to the extended double-precision floating-point format.  The conversion
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 | Arithmetic.
1188 *----------------------------------------------------------------------------*/
1189
1190 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1191 {
1192     flag zSign;
1193     uint64 absA;
1194     int8 shiftCount;
1195
1196     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1197     zSign = ( a < 0 );
1198     absA = zSign ? - a : a;
1199     shiftCount = countLeadingZeros64( absA );
1200     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1201
1202 }
1203
1204 #endif
1205
1206 #ifdef FLOAT128
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a' to
1210 | the quadruple-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float128 int64_to_float128( int64 a STATUS_PARAM )
1215 {
1216     flag zSign;
1217     uint64 absA;
1218     int8 shiftCount;
1219     int32 zExp;
1220     bits64 zSig0, zSig1;
1221
1222     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1223     zSign = ( a < 0 );
1224     absA = zSign ? - a : a;
1225     shiftCount = countLeadingZeros64( absA ) + 49;
1226     zExp = 0x406E - shiftCount;
1227     if ( 64 <= shiftCount ) {
1228         zSig1 = 0;
1229         zSig0 = absA;
1230         shiftCount -= 64;
1231     }
1232     else {
1233         zSig1 = absA;
1234         zSig0 = 0;
1235     }
1236     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1237     return packFloat128( zSign, zExp, zSig0, zSig1 );
1238
1239 }
1240
1241 #endif
1242
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the single-precision floating-point value
1245 | `a' to the 32-bit two's complement integer format.  The conversion is
1246 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1247 | Arithmetic---which means in particular that the conversion is rounded
1248 | according to the current rounding mode.  If `a' is a NaN, the largest
1249 | positive integer is returned.  Otherwise, if the conversion overflows, the
1250 | largest integer with the same sign as `a' is returned.
1251 *----------------------------------------------------------------------------*/
1252
1253 int32 float32_to_int32( float32 a STATUS_PARAM )
1254 {
1255     flag aSign;
1256     int16 aExp, shiftCount;
1257     bits32 aSig;
1258     bits64 aSig64;
1259
1260     aSig = extractFloat32Frac( a );
1261     aExp = extractFloat32Exp( a );
1262     aSign = extractFloat32Sign( a );
1263     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1264     if ( aExp ) aSig |= 0x00800000;
1265     shiftCount = 0xAF - aExp;
1266     aSig64 = aSig;
1267     aSig64 <<= 32;
1268     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1269     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1270
1271 }
1272
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the single-precision floating-point value
1275 | `a' to the 32-bit two's complement integer format.  The conversion is
1276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 | Arithmetic, except that the conversion is always rounded toward zero.
1278 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1279 | the conversion overflows, the largest integer with the same sign as `a' is
1280 | returned.
1281 *----------------------------------------------------------------------------*/
1282
1283 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1284 {
1285     flag aSign;
1286     int16 aExp, shiftCount;
1287     bits32 aSig;
1288     int32 z;
1289
1290     aSig = extractFloat32Frac( a );
1291     aExp = extractFloat32Exp( a );
1292     aSign = extractFloat32Sign( a );
1293     shiftCount = aExp - 0x9E;
1294     if ( 0 <= shiftCount ) {
1295         if ( a != 0xCF000000 ) {
1296             float_raise( float_flag_invalid STATUS_VAR);
1297             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1298         }
1299         return (sbits32) 0x80000000;
1300     }
1301     else if ( aExp <= 0x7E ) {
1302         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1303         return 0;
1304     }
1305     aSig = ( aSig | 0x00800000 )<<8;
1306     z = aSig>>( - shiftCount );
1307     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1308         STATUS(float_exception_flags) |= float_flag_inexact;
1309     }
1310     if ( aSign ) z = - z;
1311     return z;
1312
1313 }
1314
1315 /*----------------------------------------------------------------------------
1316 | Returns the result of converting the single-precision floating-point value
1317 | `a' to the 64-bit two's complement integer format.  The conversion is
1318 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 | Arithmetic---which means in particular that the conversion is rounded
1320 | according to the current rounding mode.  If `a' is a NaN, the largest
1321 | positive integer is returned.  Otherwise, if the conversion overflows, the
1322 | largest integer with the same sign as `a' is returned.
1323 *----------------------------------------------------------------------------*/
1324
1325 int64 float32_to_int64( float32 a STATUS_PARAM )
1326 {
1327     flag aSign;
1328     int16 aExp, shiftCount;
1329     bits32 aSig;
1330     bits64 aSig64, aSigExtra;
1331
1332     aSig = extractFloat32Frac( a );
1333     aExp = extractFloat32Exp( a );
1334     aSign = extractFloat32Sign( a );
1335     shiftCount = 0xBE - aExp;
1336     if ( shiftCount < 0 ) {
1337         float_raise( float_flag_invalid STATUS_VAR);
1338         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1339             return LIT64( 0x7FFFFFFFFFFFFFFF );
1340         }
1341         return (sbits64) LIT64( 0x8000000000000000 );
1342     }
1343     if ( aExp ) aSig |= 0x00800000;
1344     aSig64 = aSig;
1345     aSig64 <<= 40;
1346     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1347     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1348
1349 }
1350
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the single-precision floating-point value
1353 | `a' to the 64-bit two's complement integer format.  The conversion is
1354 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1355 | Arithmetic, except that the conversion is always rounded toward zero.  If
1356 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1357 | conversion overflows, the largest integer with the same sign as `a' is
1358 | returned.
1359 *----------------------------------------------------------------------------*/
1360
1361 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1362 {
1363     flag aSign;
1364     int16 aExp, shiftCount;
1365     bits32 aSig;
1366     bits64 aSig64;
1367     int64 z;
1368
1369     aSig = extractFloat32Frac( a );
1370     aExp = extractFloat32Exp( a );
1371     aSign = extractFloat32Sign( a );
1372     shiftCount = aExp - 0xBE;
1373     if ( 0 <= shiftCount ) {
1374         if ( a != 0xDF000000 ) {
1375             float_raise( float_flag_invalid STATUS_VAR);
1376             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1377                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1378             }
1379         }
1380         return (sbits64) LIT64( 0x8000000000000000 );
1381     }
1382     else if ( aExp <= 0x7E ) {
1383         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1384         return 0;
1385     }
1386     aSig64 = aSig | 0x00800000;
1387     aSig64 <<= 40;
1388     z = aSig64>>( - shiftCount );
1389     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1390         STATUS(float_exception_flags) |= float_flag_inexact;
1391     }
1392     if ( aSign ) z = - z;
1393     return z;
1394
1395 }
1396
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the double-precision floating-point format.  The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 | Arithmetic.
1402 *----------------------------------------------------------------------------*/
1403
1404 float64 float32_to_float64( float32 a STATUS_PARAM )
1405 {
1406     flag aSign;
1407     int16 aExp;
1408     bits32 aSig;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     if ( aExp == 0xFF ) {
1414         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1415         return packFloat64( aSign, 0x7FF, 0 );
1416     }
1417     if ( aExp == 0 ) {
1418         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1419         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1420         --aExp;
1421     }
1422     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1423
1424 }
1425
1426 #ifdef FLOATX80
1427
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the single-precision floating-point value
1430 | `a' to the extended double-precision floating-point format.  The conversion
1431 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1432 | Arithmetic.
1433 *----------------------------------------------------------------------------*/
1434
1435 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1436 {
1437     flag aSign;
1438     int16 aExp;
1439     bits32 aSig;
1440
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     if ( aExp == 0xFF ) {
1445         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1446         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1447     }
1448     if ( aExp == 0 ) {
1449         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1450         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1451     }
1452     aSig |= 0x00800000;
1453     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1454
1455 }
1456
1457 #endif
1458
1459 #ifdef FLOAT128
1460
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the double-precision floating-point format.  The conversion is
1464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1465 | Arithmetic.
1466 *----------------------------------------------------------------------------*/
1467
1468 float128 float32_to_float128( float32 a STATUS_PARAM )
1469 {
1470     flag aSign;
1471     int16 aExp;
1472     bits32 aSig;
1473
1474     aSig = extractFloat32Frac( a );
1475     aExp = extractFloat32Exp( a );
1476     aSign = extractFloat32Sign( a );
1477     if ( aExp == 0xFF ) {
1478         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1479         return packFloat128( aSign, 0x7FFF, 0, 0 );
1480     }
1481     if ( aExp == 0 ) {
1482         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1483         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484         --aExp;
1485     }
1486     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1487
1488 }
1489
1490 #endif
1491
1492 /*----------------------------------------------------------------------------
1493 | Rounds the single-precision floating-point value `a' to an integer, and
1494 | returns the result as a single-precision floating-point value.  The
1495 | operation is performed according to the IEC/IEEE Standard for Binary
1496 | Floating-Point Arithmetic.
1497 *----------------------------------------------------------------------------*/
1498
1499 float32 float32_round_to_int( float32 a STATUS_PARAM)
1500 {
1501     flag aSign;
1502     int16 aExp;
1503     bits32 lastBitMask, roundBitsMask;
1504     int8 roundingMode;
1505     float32 z;
1506
1507     aExp = extractFloat32Exp( a );
1508     if ( 0x96 <= aExp ) {
1509         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1510             return propagateFloat32NaN( a, a STATUS_VAR );
1511         }
1512         return a;
1513     }
1514     if ( aExp <= 0x7E ) {
1515         if ( (bits32) ( a<<1 ) == 0 ) return a;
1516         STATUS(float_exception_flags) |= float_flag_inexact;
1517         aSign = extractFloat32Sign( a );
1518         switch ( STATUS(float_rounding_mode) ) {
1519          case float_round_nearest_even:
1520             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1521                 return packFloat32( aSign, 0x7F, 0 );
1522             }
1523             break;
1524          case float_round_down:
1525             return aSign ? 0xBF800000 : 0;
1526          case float_round_up:
1527             return aSign ? 0x80000000 : 0x3F800000;
1528         }
1529         return packFloat32( aSign, 0, 0 );
1530     }
1531     lastBitMask = 1;
1532     lastBitMask <<= 0x96 - aExp;
1533     roundBitsMask = lastBitMask - 1;
1534     z = a;
1535     roundingMode = STATUS(float_rounding_mode);
1536     if ( roundingMode == float_round_nearest_even ) {
1537         z += lastBitMask>>1;
1538         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1539     }
1540     else if ( roundingMode != float_round_to_zero ) {
1541         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1542             z += roundBitsMask;
1543         }
1544     }
1545     z &= ~ roundBitsMask;
1546     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of adding the absolute values of the single-precision
1553 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1554 | before being returned.  `zSign' is ignored if the result is a NaN.
1555 | The addition is performed according to the IEC/IEEE Standard for Binary
1556 | Floating-Point Arithmetic.
1557 *----------------------------------------------------------------------------*/
1558
1559 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1560 {
1561     int16 aExp, bExp, zExp;
1562     bits32 aSig, bSig, zSig;
1563     int16 expDiff;
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     bSig = extractFloat32Frac( b );
1568     bExp = extractFloat32Exp( b );
1569     expDiff = aExp - bExp;
1570     aSig <<= 6;
1571     bSig <<= 6;
1572     if ( 0 < expDiff ) {
1573         if ( aExp == 0xFF ) {
1574             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1575             return a;
1576         }
1577         if ( bExp == 0 ) {
1578             --expDiff;
1579         }
1580         else {
1581             bSig |= 0x20000000;
1582         }
1583         shift32RightJamming( bSig, expDiff, &bSig );
1584         zExp = aExp;
1585     }
1586     else if ( expDiff < 0 ) {
1587         if ( bExp == 0xFF ) {
1588             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1589             return packFloat32( zSign, 0xFF, 0 );
1590         }
1591         if ( aExp == 0 ) {
1592             ++expDiff;
1593         }
1594         else {
1595             aSig |= 0x20000000;
1596         }
1597         shift32RightJamming( aSig, - expDiff, &aSig );
1598         zExp = bExp;
1599     }
1600     else {
1601         if ( aExp == 0xFF ) {
1602             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1603             return a;
1604         }
1605         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1606         zSig = 0x40000000 + aSig + bSig;
1607         zExp = aExp;
1608         goto roundAndPack;
1609     }
1610     aSig |= 0x20000000;
1611     zSig = ( aSig + bSig )<<1;
1612     --zExp;
1613     if ( (sbits32) zSig < 0 ) {
1614         zSig = aSig + bSig;
1615         ++zExp;
1616     }
1617  roundAndPack:
1618     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1619
1620 }
1621
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of subtracting the absolute values of the single-
1624 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1625 | difference is negated before being returned.  `zSign' is ignored if the
1626 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1627 | Standard for Binary Floating-Point Arithmetic.
1628 *----------------------------------------------------------------------------*/
1629
1630 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1631 {
1632     int16 aExp, bExp, zExp;
1633     bits32 aSig, bSig, zSig;
1634     int16 expDiff;
1635
1636     aSig = extractFloat32Frac( a );
1637     aExp = extractFloat32Exp( a );
1638     bSig = extractFloat32Frac( b );
1639     bExp = extractFloat32Exp( b );
1640     expDiff = aExp - bExp;
1641     aSig <<= 7;
1642     bSig <<= 7;
1643     if ( 0 < expDiff ) goto aExpBigger;
1644     if ( expDiff < 0 ) goto bExpBigger;
1645     if ( aExp == 0xFF ) {
1646         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1647         float_raise( float_flag_invalid STATUS_VAR);
1648         return float32_default_nan;
1649     }
1650     if ( aExp == 0 ) {
1651         aExp = 1;
1652         bExp = 1;
1653     }
1654     if ( bSig < aSig ) goto aBigger;
1655     if ( aSig < bSig ) goto bBigger;
1656     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1657  bExpBigger:
1658     if ( bExp == 0xFF ) {
1659         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1660         return packFloat32( zSign ^ 1, 0xFF, 0 );
1661     }
1662     if ( aExp == 0 ) {
1663         ++expDiff;
1664     }
1665     else {
1666         aSig |= 0x40000000;
1667     }
1668     shift32RightJamming( aSig, - expDiff, &aSig );
1669     bSig |= 0x40000000;
1670  bBigger:
1671     zSig = bSig - aSig;
1672     zExp = bExp;
1673     zSign ^= 1;
1674     goto normalizeRoundAndPack;
1675  aExpBigger:
1676     if ( aExp == 0xFF ) {
1677         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1678         return a;
1679     }
1680     if ( bExp == 0 ) {
1681         --expDiff;
1682     }
1683     else {
1684         bSig |= 0x40000000;
1685     }
1686     shift32RightJamming( bSig, expDiff, &bSig );
1687     aSig |= 0x40000000;
1688  aBigger:
1689     zSig = aSig - bSig;
1690     zExp = aExp;
1691  normalizeRoundAndPack:
1692     --zExp;
1693     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1694
1695 }
1696
1697 /*----------------------------------------------------------------------------
1698 | Returns the result of adding the single-precision floating-point values `a'
1699 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1700 | Binary Floating-Point Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702
1703 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1704 {
1705     flag aSign, bSign;
1706
1707     aSign = extractFloat32Sign( a );
1708     bSign = extractFloat32Sign( b );
1709     if ( aSign == bSign ) {
1710         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1711     }
1712     else {
1713         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1714     }
1715
1716 }
1717
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of subtracting the single-precision floating-point values
1720 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1721 | for Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1723
1724 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1725 {
1726     flag aSign, bSign;
1727
1728     aSign = extractFloat32Sign( a );
1729     bSign = extractFloat32Sign( b );
1730     if ( aSign == bSign ) {
1731         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1732     }
1733     else {
1734         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1735     }
1736
1737 }
1738
1739 /*----------------------------------------------------------------------------
1740 | Returns the result of multiplying the single-precision floating-point values
1741 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1742 | for Binary Floating-Point Arithmetic.
1743 *----------------------------------------------------------------------------*/
1744
1745 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1746 {
1747     flag aSign, bSign, zSign;
1748     int16 aExp, bExp, zExp;
1749     bits32 aSig, bSig;
1750     bits64 zSig64;
1751     bits32 zSig;
1752
1753     aSig = extractFloat32Frac( a );
1754     aExp = extractFloat32Exp( a );
1755     aSign = extractFloat32Sign( a );
1756     bSig = extractFloat32Frac( b );
1757     bExp = extractFloat32Exp( b );
1758     bSign = extractFloat32Sign( b );
1759     zSign = aSign ^ bSign;
1760     if ( aExp == 0xFF ) {
1761         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1762             return propagateFloat32NaN( a, b STATUS_VAR );
1763         }
1764         if ( ( bExp | bSig ) == 0 ) {
1765             float_raise( float_flag_invalid STATUS_VAR);
1766             return float32_default_nan;
1767         }
1768         return packFloat32( zSign, 0xFF, 0 );
1769     }
1770     if ( bExp == 0xFF ) {
1771         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1772         if ( ( aExp | aSig ) == 0 ) {
1773             float_raise( float_flag_invalid STATUS_VAR);
1774             return float32_default_nan;
1775         }
1776         return packFloat32( zSign, 0xFF, 0 );
1777     }
1778     if ( aExp == 0 ) {
1779         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1780         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1781     }
1782     if ( bExp == 0 ) {
1783         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1784         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1785     }
1786     zExp = aExp + bExp - 0x7F;
1787     aSig = ( aSig | 0x00800000 )<<7;
1788     bSig = ( bSig | 0x00800000 )<<8;
1789     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1790     zSig = zSig64;
1791     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1792         zSig <<= 1;
1793         --zExp;
1794     }
1795     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1796
1797 }
1798
1799 /*----------------------------------------------------------------------------
1800 | Returns the result of dividing the single-precision floating-point value `a'
1801 | by the corresponding value `b'.  The operation is performed according to the
1802 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803 *----------------------------------------------------------------------------*/
1804
1805 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1806 {
1807     flag aSign, bSign, zSign;
1808     int16 aExp, bExp, zExp;
1809     bits32 aSig, bSig, zSig;
1810
1811     aSig = extractFloat32Frac( a );
1812     aExp = extractFloat32Exp( a );
1813     aSign = extractFloat32Sign( a );
1814     bSig = extractFloat32Frac( b );
1815     bExp = extractFloat32Exp( b );
1816     bSign = extractFloat32Sign( b );
1817     zSign = aSign ^ bSign;
1818     if ( aExp == 0xFF ) {
1819         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1820         if ( bExp == 0xFF ) {
1821             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822             float_raise( float_flag_invalid STATUS_VAR);
1823             return float32_default_nan;
1824         }
1825         return packFloat32( zSign, 0xFF, 0 );
1826     }
1827     if ( bExp == 0xFF ) {
1828         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1829         return packFloat32( zSign, 0, 0 );
1830     }
1831     if ( bExp == 0 ) {
1832         if ( bSig == 0 ) {
1833             if ( ( aExp | aSig ) == 0 ) {
1834                 float_raise( float_flag_invalid STATUS_VAR);
1835                 return float32_default_nan;
1836             }
1837             float_raise( float_flag_divbyzero STATUS_VAR);
1838             return packFloat32( zSign, 0xFF, 0 );
1839         }
1840         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1841     }
1842     if ( aExp == 0 ) {
1843         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1844         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1845     }
1846     zExp = aExp - bExp + 0x7D;
1847     aSig = ( aSig | 0x00800000 )<<7;
1848     bSig = ( bSig | 0x00800000 )<<8;
1849     if ( bSig <= ( aSig + aSig ) ) {
1850         aSig >>= 1;
1851         ++zExp;
1852     }
1853     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1854     if ( ( zSig & 0x3F ) == 0 ) {
1855         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1856     }
1857     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859 }
1860
1861 /*----------------------------------------------------------------------------
1862 | Returns the remainder of the single-precision floating-point value `a'
1863 | with respect to the corresponding value `b'.  The operation is performed
1864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1868 {
1869     flag aSign, bSign, zSign;
1870     int16 aExp, bExp, expDiff;
1871     bits32 aSig, bSig;
1872     bits32 q;
1873     bits64 aSig64, bSig64, q64;
1874     bits32 alternateASig;
1875     sbits32 sigMean;
1876
1877     aSig = extractFloat32Frac( a );
1878     aExp = extractFloat32Exp( a );
1879     aSign = extractFloat32Sign( a );
1880     bSig = extractFloat32Frac( b );
1881     bExp = extractFloat32Exp( b );
1882     bSign = extractFloat32Sign( b );
1883     if ( aExp == 0xFF ) {
1884         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1885             return propagateFloat32NaN( a, b STATUS_VAR );
1886         }
1887         float_raise( float_flag_invalid STATUS_VAR);
1888         return float32_default_nan;
1889     }
1890     if ( bExp == 0xFF ) {
1891         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892         return a;
1893     }
1894     if ( bExp == 0 ) {
1895         if ( bSig == 0 ) {
1896             float_raise( float_flag_invalid STATUS_VAR);
1897             return float32_default_nan;
1898         }
1899         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1900     }
1901     if ( aExp == 0 ) {
1902         if ( aSig == 0 ) return a;
1903         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1904     }
1905     expDiff = aExp - bExp;
1906     aSig |= 0x00800000;
1907     bSig |= 0x00800000;
1908     if ( expDiff < 32 ) {
1909         aSig <<= 8;
1910         bSig <<= 8;
1911         if ( expDiff < 0 ) {
1912             if ( expDiff < -1 ) return a;
1913             aSig >>= 1;
1914         }
1915         q = ( bSig <= aSig );
1916         if ( q ) aSig -= bSig;
1917         if ( 0 < expDiff ) {
1918             q = ( ( (bits64) aSig )<<32 ) / bSig;
1919             q >>= 32 - expDiff;
1920             bSig >>= 2;
1921             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1922         }
1923         else {
1924             aSig >>= 2;
1925             bSig >>= 2;
1926         }
1927     }
1928     else {
1929         if ( bSig <= aSig ) aSig -= bSig;
1930         aSig64 = ( (bits64) aSig )<<40;
1931         bSig64 = ( (bits64) bSig )<<40;
1932         expDiff -= 64;
1933         while ( 0 < expDiff ) {
1934             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1935             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1936             aSig64 = - ( ( bSig * q64 )<<38 );
1937             expDiff -= 62;
1938         }
1939         expDiff += 64;
1940         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1941         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1942         q = q64>>( 64 - expDiff );
1943         bSig <<= 6;
1944         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1945     }
1946     do {
1947         alternateASig = aSig;
1948         ++q;
1949         aSig -= bSig;
1950     } while ( 0 <= (sbits32) aSig );
1951     sigMean = aSig + alternateASig;
1952     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1953         aSig = alternateASig;
1954     }
1955     zSign = ( (sbits32) aSig < 0 );
1956     if ( zSign ) aSig = - aSig;
1957     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1958
1959 }
1960
1961 /*----------------------------------------------------------------------------
1962 | Returns the square root of the single-precision floating-point value `a'.
1963 | The operation is performed according to the IEC/IEEE Standard for Binary
1964 | Floating-Point Arithmetic.
1965 *----------------------------------------------------------------------------*/
1966
1967 float32 float32_sqrt( float32 a STATUS_PARAM )
1968 {
1969     flag aSign;
1970     int16 aExp, zExp;
1971     bits32 aSig, zSig;
1972     bits64 rem, term;
1973
1974     aSig = extractFloat32Frac( a );
1975     aExp = extractFloat32Exp( a );
1976     aSign = extractFloat32Sign( a );
1977     if ( aExp == 0xFF ) {
1978         if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1979         if ( ! aSign ) return a;
1980         float_raise( float_flag_invalid STATUS_VAR);
1981         return float32_default_nan;
1982     }
1983     if ( aSign ) {
1984         if ( ( aExp | aSig ) == 0 ) return a;
1985         float_raise( float_flag_invalid STATUS_VAR);
1986         return float32_default_nan;
1987     }
1988     if ( aExp == 0 ) {
1989         if ( aSig == 0 ) return 0;
1990         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1991     }
1992     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1993     aSig = ( aSig | 0x00800000 )<<8;
1994     zSig = estimateSqrt32( aExp, aSig ) + 2;
1995     if ( ( zSig & 0x7F ) <= 5 ) {
1996         if ( zSig < 2 ) {
1997             zSig = 0x7FFFFFFF;
1998             goto roundAndPack;
1999         }
2000         aSig >>= aExp & 1;
2001         term = ( (bits64) zSig ) * zSig;
2002         rem = ( ( (bits64) aSig )<<32 ) - term;
2003         while ( (sbits64) rem < 0 ) {
2004             --zSig;
2005             rem += ( ( (bits64) zSig )<<1 ) | 1;
2006         }
2007         zSig |= ( rem != 0 );
2008     }
2009     shift32RightJamming( zSig, 1, &zSig );
2010  roundAndPack:
2011     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2012
2013 }
2014
2015 /*----------------------------------------------------------------------------
2016 | Returns 1 if the single-precision floating-point value `a' is equal to
2017 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 *----------------------------------------------------------------------------*/
2020
2021 flag float32_eq( float32 a, float32 b STATUS_PARAM )
2022 {
2023
2024     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2025          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2026        ) {
2027         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2028             float_raise( float_flag_invalid STATUS_VAR);
2029         }
2030         return 0;
2031     }
2032     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2033
2034 }
2035
2036 /*----------------------------------------------------------------------------
2037 | Returns 1 if the single-precision floating-point value `a' is less than
2038 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040 | Arithmetic.
2041 *----------------------------------------------------------------------------*/
2042
2043 flag float32_le( float32 a, float32 b STATUS_PARAM )
2044 {
2045     flag aSign, bSign;
2046
2047     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2048          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2049        ) {
2050         float_raise( float_flag_invalid STATUS_VAR);
2051         return 0;
2052     }
2053     aSign = extractFloat32Sign( a );
2054     bSign = extractFloat32Sign( b );
2055     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2056     return ( a == b ) || ( aSign ^ ( a < b ) );
2057
2058 }
2059
2060 /*----------------------------------------------------------------------------
2061 | Returns 1 if the single-precision floating-point value `a' is less than
2062 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064 *----------------------------------------------------------------------------*/
2065
2066 flag float32_lt( float32 a, float32 b STATUS_PARAM )
2067 {
2068     flag aSign, bSign;
2069
2070     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2071          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2072        ) {
2073         float_raise( float_flag_invalid STATUS_VAR);
2074         return 0;
2075     }
2076     aSign = extractFloat32Sign( a );
2077     bSign = extractFloat32Sign( b );
2078     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2079     return ( a != b ) && ( aSign ^ ( a < b ) );
2080
2081 }
2082
2083 /*----------------------------------------------------------------------------
2084 | Returns 1 if the single-precision floating-point value `a' is equal to
2085 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2086 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088 *----------------------------------------------------------------------------*/
2089
2090 flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2091 {
2092
2093     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2095        ) {
2096         float_raise( float_flag_invalid STATUS_VAR);
2097         return 0;
2098     }
2099     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2100
2101 }
2102
2103 /*----------------------------------------------------------------------------
2104 | Returns 1 if the single-precision floating-point value `a' is less than or
2105 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2106 | cause an exception.  Otherwise, the comparison is performed according to the
2107 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108 *----------------------------------------------------------------------------*/
2109
2110 flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2111 {
2112     flag aSign, bSign;
2113
2114     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116        ) {
2117         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118             float_raise( float_flag_invalid STATUS_VAR);
2119         }
2120         return 0;
2121     }
2122     aSign = extractFloat32Sign( a );
2123     bSign = extractFloat32Sign( b );
2124     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2125     return ( a == b ) || ( aSign ^ ( a < b ) );
2126
2127 }
2128
2129 /*----------------------------------------------------------------------------
2130 | Returns 1 if the single-precision floating-point value `a' is less than
2131 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2132 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2133 | Standard for Binary Floating-Point Arithmetic.
2134 *----------------------------------------------------------------------------*/
2135
2136 flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2137 {
2138     flag aSign, bSign;
2139
2140     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142        ) {
2143         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2144             float_raise( float_flag_invalid STATUS_VAR);
2145         }
2146         return 0;
2147     }
2148     aSign = extractFloat32Sign( a );
2149     bSign = extractFloat32Sign( b );
2150     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2151     return ( a != b ) && ( aSign ^ ( a < b ) );
2152
2153 }
2154
2155 /*----------------------------------------------------------------------------
2156 | Returns the result of converting the double-precision floating-point value
2157 | `a' to the 32-bit two's complement integer format.  The conversion is
2158 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2159 | Arithmetic---which means in particular that the conversion is rounded
2160 | according to the current rounding mode.  If `a' is a NaN, the largest
2161 | positive integer is returned.  Otherwise, if the conversion overflows, the
2162 | largest integer with the same sign as `a' is returned.
2163 *----------------------------------------------------------------------------*/
2164
2165 int32 float64_to_int32( float64 a STATUS_PARAM )
2166 {
2167     flag aSign;
2168     int16 aExp, shiftCount;
2169     bits64 aSig;
2170
2171     aSig = extractFloat64Frac( a );
2172     aExp = extractFloat64Exp( a );
2173     aSign = extractFloat64Sign( a );
2174     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2175     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2176     shiftCount = 0x42C - aExp;
2177     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2178     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2179
2180 }
2181
2182 /*----------------------------------------------------------------------------
2183 | Returns the result of converting the double-precision floating-point value
2184 | `a' to the 32-bit two's complement integer format.  The conversion is
2185 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2186 | Arithmetic, except that the conversion is always rounded toward zero.
2187 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2188 | the conversion overflows, the largest integer with the same sign as `a' is
2189 | returned.
2190 *----------------------------------------------------------------------------*/
2191
2192 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2193 {
2194     flag aSign;
2195     int16 aExp, shiftCount;
2196     bits64 aSig, savedASig;
2197     int32 z;
2198
2199     aSig = extractFloat64Frac( a );
2200     aExp = extractFloat64Exp( a );
2201     aSign = extractFloat64Sign( a );
2202     if ( 0x41E < aExp ) {
2203         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2204         goto invalid;
2205     }
2206     else if ( aExp < 0x3FF ) {
2207         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2208         return 0;
2209     }
2210     aSig |= LIT64( 0x0010000000000000 );
2211     shiftCount = 0x433 - aExp;
2212     savedASig = aSig;
2213     aSig >>= shiftCount;
2214     z = aSig;
2215     if ( aSign ) z = - z;
2216     if ( ( z < 0 ) ^ aSign ) {
2217  invalid:
2218         float_raise( float_flag_invalid STATUS_VAR);
2219         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2220     }
2221     if ( ( aSig<<shiftCount ) != savedASig ) {
2222         STATUS(float_exception_flags) |= float_flag_inexact;
2223     }
2224     return z;
2225
2226 }
2227
2228 /*----------------------------------------------------------------------------
2229 | Returns the result of converting the double-precision floating-point value
2230 | `a' to the 64-bit two's complement integer format.  The conversion is
2231 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2232 | Arithmetic---which means in particular that the conversion is rounded
2233 | according to the current rounding mode.  If `a' is a NaN, the largest
2234 | positive integer is returned.  Otherwise, if the conversion overflows, the
2235 | largest integer with the same sign as `a' is returned.
2236 *----------------------------------------------------------------------------*/
2237
2238 int64 float64_to_int64( float64 a STATUS_PARAM )
2239 {
2240     flag aSign;
2241     int16 aExp, shiftCount;
2242     bits64 aSig, aSigExtra;
2243
2244     aSig = extractFloat64Frac( a );
2245     aExp = extractFloat64Exp( a );
2246     aSign = extractFloat64Sign( a );
2247     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2248     shiftCount = 0x433 - aExp;
2249     if ( shiftCount <= 0 ) {
2250         if ( 0x43E < aExp ) {
2251             float_raise( float_flag_invalid STATUS_VAR);
2252             if (    ! aSign
2253                  || (    ( aExp == 0x7FF )
2254                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2255                ) {
2256                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2257             }
2258             return (sbits64) LIT64( 0x8000000000000000 );
2259         }
2260         aSigExtra = 0;
2261         aSig <<= - shiftCount;
2262     }
2263     else {
2264         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2265     }
2266     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2267
2268 }
2269
2270 /*----------------------------------------------------------------------------
2271 | Returns the result of converting the double-precision floating-point value
2272 | `a' to the 64-bit two's complement integer format.  The conversion is
2273 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2274 | Arithmetic, except that the conversion is always rounded toward zero.
2275 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2276 | the conversion overflows, the largest integer with the same sign as `a' is
2277 | returned.
2278 *----------------------------------------------------------------------------*/
2279
2280 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2281 {
2282     flag aSign;
2283     int16 aExp, shiftCount;
2284     bits64 aSig;
2285     int64 z;
2286
2287     aSig = extractFloat64Frac( a );
2288     aExp = extractFloat64Exp( a );
2289     aSign = extractFloat64Sign( a );
2290     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2291     shiftCount = aExp - 0x433;
2292     if ( 0 <= shiftCount ) {
2293         if ( 0x43E <= aExp ) {
2294             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2295                 float_raise( float_flag_invalid STATUS_VAR);
2296                 if (    ! aSign
2297                      || (    ( aExp == 0x7FF )
2298                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2299                    ) {
2300                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2301                 }
2302             }
2303             return (sbits64) LIT64( 0x8000000000000000 );
2304         }
2305         z = aSig<<shiftCount;
2306     }
2307     else {
2308         if ( aExp < 0x3FE ) {
2309             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2310             return 0;
2311         }
2312         z = aSig>>( - shiftCount );
2313         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2314             STATUS(float_exception_flags) |= float_flag_inexact;
2315         }
2316     }
2317     if ( aSign ) z = - z;
2318     return z;
2319
2320 }
2321
2322 /*----------------------------------------------------------------------------
2323 | Returns the result of converting the double-precision floating-point value
2324 | `a' to the single-precision floating-point format.  The conversion is
2325 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2326 | Arithmetic.
2327 *----------------------------------------------------------------------------*/
2328
2329 float32 float64_to_float32( float64 a STATUS_PARAM )
2330 {
2331     flag aSign;
2332     int16 aExp;
2333     bits64 aSig;
2334     bits32 zSig;
2335
2336     aSig = extractFloat64Frac( a );
2337     aExp = extractFloat64Exp( a );
2338     aSign = extractFloat64Sign( a );
2339     if ( aExp == 0x7FF ) {
2340         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2341         return packFloat32( aSign, 0xFF, 0 );
2342     }
2343     shift64RightJamming( aSig, 22, &aSig );
2344     zSig = aSig;
2345     if ( aExp || zSig ) {
2346         zSig |= 0x40000000;
2347         aExp -= 0x381;
2348     }
2349     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2350
2351 }
2352
2353 #ifdef FLOATX80
2354
2355 /*----------------------------------------------------------------------------
2356 | Returns the result of converting the double-precision floating-point value
2357 | `a' to the extended double-precision floating-point format.  The conversion
2358 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2359 | Arithmetic.
2360 *----------------------------------------------------------------------------*/
2361
2362 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2363 {
2364     flag aSign;
2365     int16 aExp;
2366     bits64 aSig;
2367
2368     aSig = extractFloat64Frac( a );
2369     aExp = extractFloat64Exp( a );
2370     aSign = extractFloat64Sign( a );
2371     if ( aExp == 0x7FF ) {
2372         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2373         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2374     }
2375     if ( aExp == 0 ) {
2376         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2377         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2378     }
2379     return
2380         packFloatx80(
2381             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2382
2383 }
2384
2385 #endif
2386
2387 #ifdef FLOAT128
2388
2389 /*----------------------------------------------------------------------------
2390 | Returns the result of converting the double-precision floating-point value
2391 | `a' to the quadruple-precision floating-point format.  The conversion is
2392 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2393 | Arithmetic.
2394 *----------------------------------------------------------------------------*/
2395
2396 float128 float64_to_float128( float64 a STATUS_PARAM )
2397 {
2398     flag aSign;
2399     int16 aExp;
2400     bits64 aSig, zSig0, zSig1;
2401
2402     aSig = extractFloat64Frac( a );
2403     aExp = extractFloat64Exp( a );
2404     aSign = extractFloat64Sign( a );
2405     if ( aExp == 0x7FF ) {
2406         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2407         return packFloat128( aSign, 0x7FFF, 0, 0 );
2408     }
2409     if ( aExp == 0 ) {
2410         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2411         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2412         --aExp;
2413     }
2414     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2415     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2416
2417 }
2418
2419 #endif
2420
2421 /*----------------------------------------------------------------------------
2422 | Rounds the double-precision floating-point value `a' to an integer, and
2423 | returns the result as a double-precision floating-point value.  The
2424 | operation is performed according to the IEC/IEEE Standard for Binary
2425 | Floating-Point Arithmetic.
2426 *----------------------------------------------------------------------------*/
2427
2428 float64 float64_round_to_int( float64 a STATUS_PARAM )
2429 {
2430     flag aSign;
2431     int16 aExp;
2432     bits64 lastBitMask, roundBitsMask;
2433     int8 roundingMode;
2434     float64 z;
2435
2436     aExp = extractFloat64Exp( a );
2437     if ( 0x433 <= aExp ) {
2438         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2439             return propagateFloat64NaN( a, a STATUS_VAR );
2440         }
2441         return a;
2442     }
2443     if ( aExp < 0x3FF ) {
2444         if ( (bits64) ( a<<1 ) == 0 ) return a;
2445         STATUS(float_exception_flags) |= float_flag_inexact;
2446         aSign = extractFloat64Sign( a );
2447         switch ( STATUS(float_rounding_mode) ) {
2448          case float_round_nearest_even:
2449             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2450                 return packFloat64( aSign, 0x3FF, 0 );
2451             }
2452             break;
2453          case float_round_down:
2454             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2455          case float_round_up:
2456             return
2457             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2458         }
2459         return packFloat64( aSign, 0, 0 );
2460     }
2461     lastBitMask = 1;
2462     lastBitMask <<= 0x433 - aExp;
2463     roundBitsMask = lastBitMask - 1;
2464     z = a;
2465     roundingMode = STATUS(float_rounding_mode);
2466     if ( roundingMode == float_round_nearest_even ) {
2467         z += lastBitMask>>1;
2468         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2469     }
2470     else if ( roundingMode != float_round_to_zero ) {
2471         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2472             z += roundBitsMask;
2473         }
2474     }
2475     z &= ~ roundBitsMask;
2476     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2477     return z;
2478
2479 }
2480
2481 /*----------------------------------------------------------------------------
2482 | Returns the result of adding the absolute values of the double-precision
2483 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2484 | before being returned.  `zSign' is ignored if the result is a NaN.
2485 | The addition is performed according to the IEC/IEEE Standard for Binary
2486 | Floating-Point Arithmetic.
2487 *----------------------------------------------------------------------------*/
2488
2489 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2490 {
2491     int16 aExp, bExp, zExp;
2492     bits64 aSig, bSig, zSig;
2493     int16 expDiff;
2494
2495     aSig = extractFloat64Frac( a );
2496     aExp = extractFloat64Exp( a );
2497     bSig = extractFloat64Frac( b );
2498     bExp = extractFloat64Exp( b );
2499     expDiff = aExp - bExp;
2500     aSig <<= 9;
2501     bSig <<= 9;
2502     if ( 0 < expDiff ) {
2503         if ( aExp == 0x7FF ) {
2504             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2505             return a;
2506         }
2507         if ( bExp == 0 ) {
2508             --expDiff;
2509         }
2510         else {
2511             bSig |= LIT64( 0x2000000000000000 );
2512         }
2513         shift64RightJamming( bSig, expDiff, &bSig );
2514         zExp = aExp;
2515     }
2516     else if ( expDiff < 0 ) {
2517         if ( bExp == 0x7FF ) {
2518             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2519             return packFloat64( zSign, 0x7FF, 0 );
2520         }
2521         if ( aExp == 0 ) {
2522             ++expDiff;
2523         }
2524         else {
2525             aSig |= LIT64( 0x2000000000000000 );
2526         }
2527         shift64RightJamming( aSig, - expDiff, &aSig );
2528         zExp = bExp;
2529     }
2530     else {
2531         if ( aExp == 0x7FF ) {
2532             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2533             return a;
2534         }
2535         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2536         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2537         zExp = aExp;
2538         goto roundAndPack;
2539     }
2540     aSig |= LIT64( 0x2000000000000000 );
2541     zSig = ( aSig + bSig )<<1;
2542     --zExp;
2543     if ( (sbits64) zSig < 0 ) {
2544         zSig = aSig + bSig;
2545         ++zExp;
2546     }
2547  roundAndPack:
2548     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2549
2550 }
2551
2552 /*----------------------------------------------------------------------------
2553 | Returns the result of subtracting the absolute values of the double-
2554 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2555 | difference is negated before being returned.  `zSign' is ignored if the
2556 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2557 | Standard for Binary Floating-Point Arithmetic.
2558 *----------------------------------------------------------------------------*/
2559
2560 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2561 {
2562     int16 aExp, bExp, zExp;
2563     bits64 aSig, bSig, zSig;
2564     int16 expDiff;
2565
2566     aSig = extractFloat64Frac( a );
2567     aExp = extractFloat64Exp( a );
2568     bSig = extractFloat64Frac( b );
2569     bExp = extractFloat64Exp( b );
2570     expDiff = aExp - bExp;
2571     aSig <<= 10;
2572     bSig <<= 10;
2573     if ( 0 < expDiff ) goto aExpBigger;
2574     if ( expDiff < 0 ) goto bExpBigger;
2575     if ( aExp == 0x7FF ) {
2576         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577         float_raise( float_flag_invalid STATUS_VAR);
2578         return float64_default_nan;
2579     }
2580     if ( aExp == 0 ) {
2581         aExp = 1;
2582         bExp = 1;
2583     }
2584     if ( bSig < aSig ) goto aBigger;
2585     if ( aSig < bSig ) goto bBigger;
2586     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2587  bExpBigger:
2588     if ( bExp == 0x7FF ) {
2589         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2590         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2591     }
2592     if ( aExp == 0 ) {
2593         ++expDiff;
2594     }
2595     else {
2596         aSig |= LIT64( 0x4000000000000000 );
2597     }
2598     shift64RightJamming( aSig, - expDiff, &aSig );
2599     bSig |= LIT64( 0x4000000000000000 );
2600  bBigger:
2601     zSig = bSig - aSig;
2602     zExp = bExp;
2603     zSign ^= 1;
2604     goto normalizeRoundAndPack;
2605  aExpBigger:
2606     if ( aExp == 0x7FF ) {
2607         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2608         return a;
2609     }
2610     if ( bExp == 0 ) {
2611         --expDiff;
2612     }
2613     else {
2614         bSig |= LIT64( 0x4000000000000000 );
2615     }
2616     shift64RightJamming( bSig, expDiff, &bSig );
2617     aSig |= LIT64( 0x4000000000000000 );
2618  aBigger:
2619     zSig = aSig - bSig;
2620     zExp = aExp;
2621  normalizeRoundAndPack:
2622     --zExp;
2623     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2624
2625 }
2626
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of adding the double-precision floating-point values `a'
2629 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2630 | Binary Floating-Point Arithmetic.
2631 *----------------------------------------------------------------------------*/
2632
2633 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2634 {
2635     flag aSign, bSign;
2636
2637     aSign = extractFloat64Sign( a );
2638     bSign = extractFloat64Sign( b );
2639     if ( aSign == bSign ) {
2640         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2641     }
2642     else {
2643         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2644     }
2645
2646 }
2647
2648 /*----------------------------------------------------------------------------
2649 | Returns the result of subtracting the double-precision floating-point values
2650 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2651 | for Binary Floating-Point Arithmetic.
2652 *----------------------------------------------------------------------------*/
2653
2654 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2655 {
2656     flag aSign, bSign;
2657
2658     aSign = extractFloat64Sign( a );
2659     bSign = extractFloat64Sign( b );
2660     if ( aSign == bSign ) {
2661         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2662     }
2663     else {
2664         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2665     }
2666
2667 }
2668
2669 /*----------------------------------------------------------------------------
2670 | Returns the result of multiplying the double-precision floating-point values
2671 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2672 | for Binary Floating-Point Arithmetic.
2673 *----------------------------------------------------------------------------*/
2674
2675 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2676 {
2677     flag aSign, bSign, zSign;
2678     int16 aExp, bExp, zExp;
2679     bits64 aSig, bSig, zSig0, zSig1;
2680
2681     aSig = extractFloat64Frac( a );
2682     aExp = extractFloat64Exp( a );
2683     aSign = extractFloat64Sign( a );
2684     bSig = extractFloat64Frac( b );
2685     bExp = extractFloat64Exp( b );
2686     bSign = extractFloat64Sign( b );
2687     zSign = aSign ^ bSign;
2688     if ( aExp == 0x7FF ) {
2689         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2690             return propagateFloat64NaN( a, b STATUS_VAR );
2691         }
2692         if ( ( bExp | bSig ) == 0 ) {
2693             float_raise( float_flag_invalid STATUS_VAR);
2694             return float64_default_nan;
2695         }
2696         return packFloat64( zSign, 0x7FF, 0 );
2697     }
2698     if ( bExp == 0x7FF ) {
2699         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2700         if ( ( aExp | aSig ) == 0 ) {
2701             float_raise( float_flag_invalid STATUS_VAR);
2702             return float64_default_nan;
2703         }
2704         return packFloat64( zSign, 0x7FF, 0 );
2705     }
2706     if ( aExp == 0 ) {
2707         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2708         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2709     }
2710     if ( bExp == 0 ) {
2711         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2712         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2713     }
2714     zExp = aExp + bExp - 0x3FF;
2715     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2716     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2717     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2718     zSig0 |= ( zSig1 != 0 );
2719     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2720         zSig0 <<= 1;
2721         --zExp;
2722     }
2723     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2724
2725 }
2726
2727 /*----------------------------------------------------------------------------
2728 | Returns the result of dividing the double-precision floating-point value `a'
2729 | by the corresponding value `b'.  The operation is performed according to
2730 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2732
2733 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2734 {
2735     flag aSign, bSign, zSign;
2736     int16 aExp, bExp, zExp;
2737     bits64 aSig, bSig, zSig;
2738     bits64 rem0, rem1;
2739     bits64 term0, term1;
2740
2741     aSig = extractFloat64Frac( a );
2742     aExp = extractFloat64Exp( a );
2743     aSign = extractFloat64Sign( a );
2744     bSig = extractFloat64Frac( b );
2745     bExp = extractFloat64Exp( b );
2746     bSign = extractFloat64Sign( b );
2747     zSign = aSign ^ bSign;
2748     if ( aExp == 0x7FF ) {
2749         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2750         if ( bExp == 0x7FF ) {
2751             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2752             float_raise( float_flag_invalid STATUS_VAR);
2753             return float64_default_nan;
2754         }
2755         return packFloat64( zSign, 0x7FF, 0 );
2756     }
2757     if ( bExp == 0x7FF ) {
2758         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2759         return packFloat64( zSign, 0, 0 );
2760     }
2761     if ( bExp == 0 ) {
2762         if ( bSig == 0 ) {
2763             if ( ( aExp | aSig ) == 0 ) {
2764                 float_raise( float_flag_invalid STATUS_VAR);
2765                 return float64_default_nan;
2766             }
2767             float_raise( float_flag_divbyzero STATUS_VAR);
2768             return packFloat64( zSign, 0x7FF, 0 );
2769         }
2770         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2771     }
2772     if ( aExp == 0 ) {
2773         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2774         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2775     }
2776     zExp = aExp - bExp + 0x3FD;
2777     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2778     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2779     if ( bSig <= ( aSig + aSig ) ) {
2780         aSig >>= 1;
2781         ++zExp;
2782     }
2783     zSig = estimateDiv128To64( aSig, 0, bSig );
2784     if ( ( zSig & 0x1FF ) <= 2 ) {
2785         mul64To128( bSig, zSig, &term0, &term1 );
2786         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2787         while ( (sbits64) rem0 < 0 ) {
2788             --zSig;
2789             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2790         }
2791         zSig |= ( rem1 != 0 );
2792     }
2793     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2794
2795 }
2796
2797 /*----------------------------------------------------------------------------
2798 | Returns the remainder of the double-precision floating-point value `a'
2799 | with respect to the corresponding value `b'.  The operation is performed
2800 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2801 *----------------------------------------------------------------------------*/
2802
2803 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2804 {
2805     flag aSign, bSign, zSign;
2806     int16 aExp, bExp, expDiff;
2807     bits64 aSig, bSig;
2808     bits64 q, alternateASig;
2809     sbits64 sigMean;
2810
2811     aSig = extractFloat64Frac( a );
2812     aExp = extractFloat64Exp( a );
2813     aSign = extractFloat64Sign( a );
2814     bSig = extractFloat64Frac( b );
2815     bExp = extractFloat64Exp( b );
2816     bSign = extractFloat64Sign( b );
2817     if ( aExp == 0x7FF ) {
2818         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2819             return propagateFloat64NaN( a, b STATUS_VAR );
2820         }
2821         float_raise( float_flag_invalid STATUS_VAR);
2822         return float64_default_nan;
2823     }
2824     if ( bExp == 0x7FF ) {
2825         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2826         return a;
2827     }
2828     if ( bExp == 0 ) {
2829         if ( bSig == 0 ) {
2830             float_raise( float_flag_invalid STATUS_VAR);
2831             return float64_default_nan;
2832         }
2833         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834     }
2835     if ( aExp == 0 ) {
2836         if ( aSig == 0 ) return a;
2837         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838     }
2839     expDiff = aExp - bExp;
2840     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2841     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842     if ( expDiff < 0 ) {
2843         if ( expDiff < -1 ) return a;
2844         aSig >>= 1;
2845     }
2846     q = ( bSig <= aSig );
2847     if ( q ) aSig -= bSig;
2848     expDiff -= 64;
2849     while ( 0 < expDiff ) {
2850         q = estimateDiv128To64( aSig, 0, bSig );
2851         q = ( 2 < q ) ? q - 2 : 0;
2852         aSig = - ( ( bSig>>2 ) * q );
2853         expDiff -= 62;
2854     }
2855     expDiff += 64;
2856     if ( 0 < expDiff ) {
2857         q = estimateDiv128To64( aSig, 0, bSig );
2858         q = ( 2 < q ) ? q - 2 : 0;
2859         q >>= 64 - expDiff;
2860         bSig >>= 2;
2861         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2862     }
2863     else {
2864         aSig >>= 2;
2865         bSig >>= 2;
2866     }
2867     do {
2868         alternateASig = aSig;
2869         ++q;
2870         aSig -= bSig;
2871     } while ( 0 <= (sbits64) aSig );
2872     sigMean = aSig + alternateASig;
2873     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2874         aSig = alternateASig;
2875     }
2876     zSign = ( (sbits64) aSig < 0 );
2877     if ( zSign ) aSig = - aSig;
2878     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2879
2880 }
2881
2882 /*----------------------------------------------------------------------------
2883 | Returns the square root of the double-precision floating-point value `a'.
2884 | The operation is performed according to the IEC/IEEE Standard for Binary
2885 | Floating-Point Arithmetic.
2886 *----------------------------------------------------------------------------*/
2887
2888 float64 float64_sqrt( float64 a STATUS_PARAM )
2889 {
2890     flag aSign;
2891     int16 aExp, zExp;
2892     bits64 aSig, zSig, doubleZSig;
2893     bits64 rem0, rem1, term0, term1;
2894
2895     aSig = extractFloat64Frac( a );
2896     aExp = extractFloat64Exp( a );
2897     aSign = extractFloat64Sign( a );
2898     if ( aExp == 0x7FF ) {
2899         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2900         if ( ! aSign ) return a;
2901         float_raise( float_flag_invalid STATUS_VAR);
2902         return float64_default_nan;
2903     }
2904     if ( aSign ) {
2905         if ( ( aExp | aSig ) == 0 ) return a;
2906         float_raise( float_flag_invalid STATUS_VAR);
2907         return float64_default_nan;
2908     }
2909     if ( aExp == 0 ) {
2910         if ( aSig == 0 ) return 0;
2911         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2912     }
2913     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2914     aSig |= LIT64( 0x0010000000000000 );
2915     zSig = estimateSqrt32( aExp, aSig>>21 );
2916     aSig <<= 9 - ( aExp & 1 );
2917     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2918     if ( ( zSig & 0x1FF ) <= 5 ) {
2919         doubleZSig = zSig<<1;
2920         mul64To128( zSig, zSig, &term0, &term1 );
2921         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2922         while ( (sbits64) rem0 < 0 ) {
2923             --zSig;
2924             doubleZSig -= 2;
2925             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2926         }
2927         zSig |= ( ( rem0 | rem1 ) != 0 );
2928     }
2929     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2930
2931 }
2932
2933 /*----------------------------------------------------------------------------
2934 | Returns 1 if the double-precision floating-point value `a' is equal to the
2935 | corresponding value `b', and 0 otherwise.  The comparison is performed
2936 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2937 *----------------------------------------------------------------------------*/
2938
2939 flag float64_eq( float64 a, float64 b STATUS_PARAM )
2940 {
2941
2942     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2943          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2944        ) {
2945         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2946             float_raise( float_flag_invalid STATUS_VAR);
2947         }
2948         return 0;
2949     }
2950     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2951
2952 }
2953
2954 /*----------------------------------------------------------------------------
2955 | Returns 1 if the double-precision floating-point value `a' is less than or
2956 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2957 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2958 | Arithmetic.
2959 *----------------------------------------------------------------------------*/
2960
2961 flag float64_le( float64 a, float64 b STATUS_PARAM )
2962 {
2963     flag aSign, bSign;
2964
2965     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2966          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2967        ) {
2968         float_raise( float_flag_invalid STATUS_VAR);
2969         return 0;
2970     }
2971     aSign = extractFloat64Sign( a );
2972     bSign = extractFloat64Sign( b );
2973     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2974     return ( a == b ) || ( aSign ^ ( a < b ) );
2975
2976 }
2977
2978 /*----------------------------------------------------------------------------
2979 | Returns 1 if the double-precision floating-point value `a' is less than
2980 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2981 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2982 *----------------------------------------------------------------------------*/
2983
2984 flag float64_lt( float64 a, float64 b STATUS_PARAM )
2985 {
2986     flag aSign, bSign;
2987
2988     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2989          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2990        ) {
2991         float_raise( float_flag_invalid STATUS_VAR);
2992         return 0;
2993     }
2994     aSign = extractFloat64Sign( a );
2995     bSign = extractFloat64Sign( b );
2996     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2997     return ( a != b ) && ( aSign ^ ( a < b ) );
2998
2999 }
3000
3001 /*----------------------------------------------------------------------------
3002 | Returns 1 if the double-precision floating-point value `a' is equal to the
3003 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3004 | if either operand is a NaN.  Otherwise, the comparison is performed
3005 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3006 *----------------------------------------------------------------------------*/
3007
3008 flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3009 {
3010
3011     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3012          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3013        ) {
3014         float_raise( float_flag_invalid STATUS_VAR);
3015         return 0;
3016     }
3017     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3018
3019 }
3020
3021 /*----------------------------------------------------------------------------
3022 | Returns 1 if the double-precision floating-point value `a' is less than or
3023 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3024 | cause an exception.  Otherwise, the comparison is performed according to the
3025 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026 *----------------------------------------------------------------------------*/
3027
3028 flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3029 {
3030     flag aSign, bSign;
3031
3032     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034        ) {
3035         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3036             float_raise( float_flag_invalid STATUS_VAR);
3037         }
3038         return 0;
3039     }
3040     aSign = extractFloat64Sign( a );
3041     bSign = extractFloat64Sign( b );
3042     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3043     return ( a == b ) || ( aSign ^ ( a < b ) );
3044
3045 }
3046
3047 /*----------------------------------------------------------------------------
3048 | Returns 1 if the double-precision floating-point value `a' is less than
3049 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3050 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3051 | Standard for Binary Floating-Point Arithmetic.
3052 *----------------------------------------------------------------------------*/
3053
3054 flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3055 {
3056     flag aSign, bSign;
3057
3058     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3060        ) {
3061         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3062             float_raise( float_flag_invalid STATUS_VAR);
3063         }
3064         return 0;
3065     }
3066     aSign = extractFloat64Sign( a );
3067     bSign = extractFloat64Sign( b );
3068     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3069     return ( a != b ) && ( aSign ^ ( a < b ) );
3070
3071 }
3072
3073 #ifdef FLOATX80
3074
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the extended double-precision floating-
3077 | point value `a' to the 32-bit two's complement integer format.  The
3078 | conversion is performed according to the IEC/IEEE Standard for Binary
3079 | Floating-Point Arithmetic---which means in particular that the conversion
3080 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3081 | largest positive integer is returned.  Otherwise, if the conversion
3082 | overflows, the largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3084
3085 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3086 {
3087     flag aSign;
3088     int32 aExp, shiftCount;
3089     bits64 aSig;
3090
3091     aSig = extractFloatx80Frac( a );
3092     aExp = extractFloatx80Exp( a );
3093     aSign = extractFloatx80Sign( a );
3094     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3095     shiftCount = 0x4037 - aExp;
3096     if ( shiftCount <= 0 ) shiftCount = 1;
3097     shift64RightJamming( aSig, shiftCount, &aSig );
3098     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3099
3100 }
3101
3102 /*----------------------------------------------------------------------------
3103 | Returns the result of converting the extended double-precision floating-
3104 | point value `a' to the 32-bit two's complement integer format.  The
3105 | conversion is performed according to the IEC/IEEE Standard for Binary
3106 | Floating-Point Arithmetic, except that the conversion is always rounded
3107 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3108 | Otherwise, if the conversion overflows, the largest integer with the same
3109 | sign as `a' is returned.
3110 *----------------------------------------------------------------------------*/
3111
3112 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3113 {
3114     flag aSign;
3115     int32 aExp, shiftCount;
3116     bits64 aSig, savedASig;
3117     int32 z;
3118
3119     aSig = extractFloatx80Frac( a );
3120     aExp = extractFloatx80Exp( a );
3121     aSign = extractFloatx80Sign( a );
3122     if ( 0x401E < aExp ) {
3123         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3124         goto invalid;
3125     }
3126     else if ( aExp < 0x3FFF ) {
3127         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3128         return 0;
3129     }
3130     shiftCount = 0x403E - aExp;
3131     savedASig = aSig;
3132     aSig >>= shiftCount;
3133     z = aSig;
3134     if ( aSign ) z = - z;
3135     if ( ( z < 0 ) ^ aSign ) {
3136  invalid:
3137         float_raise( float_flag_invalid STATUS_VAR);
3138         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3139     }
3140     if ( ( aSig<<shiftCount ) != savedASig ) {
3141         STATUS(float_exception_flags) |= float_flag_inexact;
3142     }
3143     return z;
3144
3145 }
3146
3147 /*----------------------------------------------------------------------------
3148 | Returns the result of converting the extended double-precision floating-
3149 | point value `a' to the 64-bit two's complement integer format.  The
3150 | conversion is performed according to the IEC/IEEE Standard for Binary
3151 | Floating-Point Arithmetic---which means in particular that the conversion
3152 | is rounded according to the current rounding mode.  If `a' is a NaN,
3153 | the largest positive integer is returned.  Otherwise, if the conversion
3154 | overflows, the largest integer with the same sign as `a' is returned.
3155 *----------------------------------------------------------------------------*/
3156
3157 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3158 {
3159     flag aSign;
3160     int32 aExp, shiftCount;
3161     bits64 aSig, aSigExtra;
3162
3163     aSig = extractFloatx80Frac( a );
3164     aExp = extractFloatx80Exp( a );
3165     aSign = extractFloatx80Sign( a );
3166     shiftCount = 0x403E - aExp;
3167     if ( shiftCount <= 0 ) {
3168         if ( shiftCount ) {
3169             float_raise( float_flag_invalid STATUS_VAR);
3170             if (    ! aSign
3171                  || (    ( aExp == 0x7FFF )
3172                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3173                ) {
3174                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3175             }
3176             return (sbits64) LIT64( 0x8000000000000000 );
3177         }
3178         aSigExtra = 0;
3179     }
3180     else {
3181         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3182     }
3183     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3184
3185 }
3186
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of converting the extended double-precision floating-
3189 | point value `a' to the 64-bit two's complement integer format.  The
3190 | conversion is performed according to the IEC/IEEE Standard for Binary
3191 | Floating-Point Arithmetic, except that the conversion is always rounded
3192 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3193 | Otherwise, if the conversion overflows, the largest integer with the same
3194 | sign as `a' is returned.
3195 *----------------------------------------------------------------------------*/
3196
3197 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3198 {
3199     flag aSign;
3200     int32 aExp, shiftCount;
3201     bits64 aSig;
3202     int64 z;
3203
3204     aSig = extractFloatx80Frac( a );
3205     aExp = extractFloatx80Exp( a );
3206     aSign = extractFloatx80Sign( a );
3207     shiftCount = aExp - 0x403E;
3208     if ( 0 <= shiftCount ) {
3209         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3210         if ( ( a.high != 0xC03E ) || aSig ) {
3211             float_raise( float_flag_invalid STATUS_VAR);
3212             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3213                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3214             }
3215         }
3216         return (sbits64) LIT64( 0x8000000000000000 );
3217     }
3218     else if ( aExp < 0x3FFF ) {
3219         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3220         return 0;
3221     }
3222     z = aSig>>( - shiftCount );
3223     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3224         STATUS(float_exception_flags) |= float_flag_inexact;
3225     }
3226     if ( aSign ) z = - z;
3227     return z;
3228
3229 }
3230
3231 /*----------------------------------------------------------------------------
3232 | Returns the result of converting the extended double-precision floating-
3233 | point value `a' to the single-precision floating-point format.  The
3234 | conversion is performed according to the IEC/IEEE Standard for Binary
3235 | Floating-Point Arithmetic.
3236 *----------------------------------------------------------------------------*/
3237
3238 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3239 {
3240     flag aSign;
3241     int32 aExp;
3242     bits64 aSig;
3243
3244     aSig = extractFloatx80Frac( a );
3245     aExp = extractFloatx80Exp( a );
3246     aSign = extractFloatx80Sign( a );
3247     if ( aExp == 0x7FFF ) {
3248         if ( (bits64) ( aSig<<1 ) ) {
3249             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3250         }
3251         return packFloat32( aSign, 0xFF, 0 );
3252     }
3253     shift64RightJamming( aSig, 33, &aSig );
3254     if ( aExp || aSig ) aExp -= 0x3F81;
3255     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3256
3257 }
3258
3259 /*----------------------------------------------------------------------------
3260 | Returns the result of converting the extended double-precision floating-
3261 | point value `a' to the double-precision floating-point format.  The
3262 | conversion is performed according to the IEC/IEEE Standard for Binary
3263 | Floating-Point Arithmetic.
3264 *----------------------------------------------------------------------------*/
3265
3266 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3267 {
3268     flag aSign;
3269     int32 aExp;
3270     bits64 aSig, zSig;
3271
3272     aSig = extractFloatx80Frac( a );
3273     aExp = extractFloatx80Exp( a );
3274     aSign = extractFloatx80Sign( a );
3275     if ( aExp == 0x7FFF ) {
3276         if ( (bits64) ( aSig<<1 ) ) {
3277             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3278         }
3279         return packFloat64( aSign, 0x7FF, 0 );
3280     }
3281     shift64RightJamming( aSig, 1, &zSig );
3282     if ( aExp || aSig ) aExp -= 0x3C01;
3283     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3284
3285 }
3286
3287 #ifdef FLOAT128
3288
3289 /*----------------------------------------------------------------------------
3290 | Returns the result of converting the extended double-precision floating-
3291 | point value `a' to the quadruple-precision floating-point format.  The
3292 | conversion is performed according to the IEC/IEEE Standard for Binary
3293 | Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3295
3296 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3297 {
3298     flag aSign;
3299     int16 aExp;
3300     bits64 aSig, zSig0, zSig1;
3301
3302     aSig = extractFloatx80Frac( a );
3303     aExp = extractFloatx80Exp( a );
3304     aSign = extractFloatx80Sign( a );
3305     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3306         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3307     }
3308     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3309     return packFloat128( aSign, aExp, zSig0, zSig1 );
3310
3311 }
3312
3313 #endif
3314
3315 /*----------------------------------------------------------------------------
3316 | Rounds the extended double-precision floating-point value `a' to an integer,
3317 | and returns the result as an extended quadruple-precision floating-point
3318 | value.  The operation is performed according to the IEC/IEEE Standard for
3319 | Binary Floating-Point Arithmetic.
3320 *----------------------------------------------------------------------------*/
3321
3322 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3323 {
3324     flag aSign;
3325     int32 aExp;
3326     bits64 lastBitMask, roundBitsMask;
3327     int8 roundingMode;
3328     floatx80 z;
3329
3330     aExp = extractFloatx80Exp( a );
3331     if ( 0x403E <= aExp ) {
3332         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3333             return propagateFloatx80NaN( a, a STATUS_VAR );
3334         }
3335         return a;
3336     }
3337     if ( aExp < 0x3FFF ) {
3338         if (    ( aExp == 0 )
3339              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3340             return a;
3341         }
3342         STATUS(float_exception_flags) |= float_flag_inexact;
3343         aSign = extractFloatx80Sign( a );
3344         switch ( STATUS(float_rounding_mode) ) {
3345          case float_round_nearest_even:
3346             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3347                ) {
3348                 return
3349                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3350             }
3351             break;
3352          case float_round_down:
3353             return
3354                   aSign ?
3355                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3356                 : packFloatx80( 0, 0, 0 );
3357          case float_round_up:
3358             return
3359                   aSign ? packFloatx80( 1, 0, 0 )
3360                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3361         }
3362         return packFloatx80( aSign, 0, 0 );
3363     }
3364     lastBitMask = 1;
3365     lastBitMask <<= 0x403E - aExp;
3366     roundBitsMask = lastBitMask - 1;
3367     z = a;
3368     roundingMode = STATUS(float_rounding_mode);
3369     if ( roundingMode == float_round_nearest_even ) {
3370         z.low += lastBitMask>>1;
3371         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3372     }
3373     else if ( roundingMode != float_round_to_zero ) {
3374         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3375             z.low += roundBitsMask;
3376         }
3377     }
3378     z.low &= ~ roundBitsMask;
3379     if ( z.low == 0 ) {
3380         ++z.high;
3381         z.low = LIT64( 0x8000000000000000 );
3382     }
3383     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3384     return z;
3385
3386 }
3387
3388 /*----------------------------------------------------------------------------
3389 | Returns the result of adding the absolute values of the extended double-
3390 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3391 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3392 | The addition is performed according to the IEC/IEEE Standard for Binary
3393 | Floating-Point Arithmetic.
3394 *----------------------------------------------------------------------------*/
3395
3396 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3397 {
3398     int32 aExp, bExp, zExp;
3399     bits64 aSig, bSig, zSig0, zSig1;
3400     int32 expDiff;
3401
3402     aSig = extractFloatx80Frac( a );
3403     aExp = extractFloatx80Exp( a );
3404     bSig = extractFloatx80Frac( b );
3405     bExp = extractFloatx80Exp( b );
3406     expDiff = aExp - bExp;
3407     if ( 0 < expDiff ) {
3408         if ( aExp == 0x7FFF ) {
3409             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3410             return a;
3411         }
3412         if ( bExp == 0 ) --expDiff;
3413         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3414         zExp = aExp;
3415     }
3416     else if ( expDiff < 0 ) {
3417         if ( bExp == 0x7FFF ) {
3418             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3419             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3420         }
3421         if ( aExp == 0 ) ++expDiff;
3422         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3423         zExp = bExp;
3424     }
3425     else {
3426         if ( aExp == 0x7FFF ) {
3427             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3428                 return propagateFloatx80NaN( a, b STATUS_VAR );
3429             }
3430             return a;
3431         }
3432         zSig1 = 0;
3433         zSig0 = aSig + bSig;
3434         if ( aExp == 0 ) {
3435             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3436             goto roundAndPack;
3437         }
3438         zExp = aExp;
3439         goto shiftRight1;
3440     }
3441     zSig0 = aSig + bSig;
3442     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3443  shiftRight1:
3444     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3445     zSig0 |= LIT64( 0x8000000000000000 );
3446     ++zExp;
3447  roundAndPack:
3448     return
3449         roundAndPackFloatx80(
3450             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3451
3452 }
3453
3454 /*----------------------------------------------------------------------------
3455 | Returns the result of subtracting the absolute values of the extended
3456 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3457 | difference is negated before being returned.  `zSign' is ignored if the
3458 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3459 | Standard for Binary Floating-Point Arithmetic.
3460 *----------------------------------------------------------------------------*/
3461
3462 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3463 {
3464     int32 aExp, bExp, zExp;
3465     bits64 aSig, bSig, zSig0, zSig1;
3466     int32 expDiff;
3467     floatx80 z;
3468
3469     aSig = extractFloatx80Frac( a );
3470     aExp = extractFloatx80Exp( a );
3471     bSig = extractFloatx80Frac( b );
3472     bExp = extractFloatx80Exp( b );
3473     expDiff = aExp - bExp;
3474     if ( 0 < expDiff ) goto aExpBigger;
3475     if ( expDiff < 0 ) goto bExpBigger;
3476     if ( aExp == 0x7FFF ) {
3477         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3478             return propagateFloatx80NaN( a, b STATUS_VAR );
3479         }
3480         float_raise( float_flag_invalid STATUS_VAR);
3481         z.low = floatx80_default_nan_low;
3482         z.high = floatx80_default_nan_high;
3483         return z;
3484     }
3485     if ( aExp == 0 ) {
3486         aExp = 1;
3487         bExp = 1;
3488     }
3489     zSig1 = 0;
3490     if ( bSig < aSig ) goto aBigger;
3491     if ( aSig < bSig ) goto bBigger;
3492     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3493  bExpBigger:
3494     if ( bExp == 0x7FFF ) {
3495         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3496         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3497     }
3498     if ( aExp == 0 ) ++expDiff;
3499     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3500  bBigger:
3501     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3502     zExp = bExp;
3503     zSign ^= 1;
3504     goto normalizeRoundAndPack;
3505  aExpBigger:
3506     if ( aExp == 0x7FFF ) {
3507         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3508         return a;
3509     }
3510     if ( bExp == 0 ) --expDiff;
3511     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3512  aBigger:
3513     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3514     zExp = aExp;
3515  normalizeRoundAndPack:
3516     return
3517         normalizeRoundAndPackFloatx80(
3518             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3519
3520 }
3521
3522 /*----------------------------------------------------------------------------
3523 | Returns the result of adding the extended double-precision floating-point
3524 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3525 | Standard for Binary Floating-Point Arithmetic.
3526 *----------------------------------------------------------------------------*/
3527
3528 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3529 {
3530     flag aSign, bSign;
3531
3532     aSign = extractFloatx80Sign( a );
3533     bSign = extractFloatx80Sign( b );
3534     if ( aSign == bSign ) {
3535         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3536     }
3537     else {
3538         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3539     }
3540
3541 }
3542
3543 /*----------------------------------------------------------------------------
3544 | Returns the result of subtracting the extended double-precision floating-
3545 | point values `a' and `b'.  The operation is performed according to the
3546 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3547 *----------------------------------------------------------------------------*/
3548
3549 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3550 {
3551     flag aSign, bSign;
3552
3553     aSign = extractFloatx80Sign( a );
3554     bSign = extractFloatx80Sign( b );
3555     if ( aSign == bSign ) {
3556         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3557     }
3558     else {
3559         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3560     }
3561
3562 }
3563
3564 /*----------------------------------------------------------------------------
3565 | Returns the result of multiplying the extended double-precision floating-
3566 | point values `a' and `b'.  The operation is performed according to the
3567 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3568 *----------------------------------------------------------------------------*/
3569
3570 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3571 {
3572     flag aSign, bSign, zSign;
3573     int32 aExp, bExp, zExp;
3574     bits64 aSig, bSig, zSig0, zSig1;
3575     floatx80 z;
3576
3577     aSig = extractFloatx80Frac( a );
3578     aExp = extractFloatx80Exp( a );
3579     aSign = extractFloatx80Sign( a );
3580     bSig = extractFloatx80Frac( b );
3581     bExp = extractFloatx80Exp( b );
3582     bSign = extractFloatx80Sign( b );
3583     zSign = aSign ^ bSign;
3584     if ( aExp == 0x7FFF ) {
3585         if (    (bits64) ( aSig<<1 )
3586              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3587             return propagateFloatx80NaN( a, b STATUS_VAR );
3588         }
3589         if ( ( bExp | bSig ) == 0 ) goto invalid;
3590         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3591     }
3592     if ( bExp == 0x7FFF ) {
3593         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3594         if ( ( aExp | aSig ) == 0 ) {
3595  invalid:
3596             float_raise( float_flag_invalid STATUS_VAR);
3597             z.low = floatx80_default_nan_low;
3598             z.high = floatx80_default_nan_high;
3599             return z;
3600         }
3601         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3602     }
3603     if ( aExp == 0 ) {
3604         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3605         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3606     }
3607     if ( bExp == 0 ) {
3608         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3609         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3610     }
3611     zExp = aExp + bExp - 0x3FFE;
3612     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3613     if ( 0 < (sbits64) zSig0 ) {
3614         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3615         --zExp;
3616     }
3617     return
3618         roundAndPackFloatx80(
3619             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3620
3621 }
3622
3623 /*----------------------------------------------------------------------------
3624 | Returns the result of dividing the extended double-precision floating-point
3625 | value `a' by the corresponding value `b'.  The operation is performed
3626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3627 *----------------------------------------------------------------------------*/
3628
3629 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3630 {
3631     flag aSign, bSign, zSign;
3632     int32 aExp, bExp, zExp;
3633     bits64 aSig, bSig, zSig0, zSig1;
3634     bits64 rem0, rem1, rem2, term0, term1, term2;
3635     floatx80 z;
3636
3637     aSig = extractFloatx80Frac( a );
3638     aExp = extractFloatx80Exp( a );
3639     aSign = extractFloatx80Sign( a );
3640     bSig = extractFloatx80Frac( b );
3641     bExp = extractFloatx80Exp( b );
3642     bSign = extractFloatx80Sign( b );
3643     zSign = aSign ^ bSign;
3644     if ( aExp == 0x7FFF ) {
3645         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3646         if ( bExp == 0x7FFF ) {
3647             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3648             goto invalid;
3649         }
3650         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3651     }
3652     if ( bExp == 0x7FFF ) {
3653         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3654         return packFloatx80( zSign, 0, 0 );
3655     }
3656     if ( bExp == 0 ) {
3657         if ( bSig == 0 ) {
3658             if ( ( aExp | aSig ) == 0 ) {
3659  invalid:
3660                 float_raise( float_flag_invalid STATUS_VAR);
3661                 z.low = floatx80_default_nan_low;
3662                 z.high = floatx80_default_nan_high;
3663                 return z;
3664             }
3665             float_raise( float_flag_divbyzero STATUS_VAR);
3666             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3667         }
3668         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3669     }
3670     if ( aExp == 0 ) {
3671         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3672         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3673     }
3674     zExp = aExp - bExp + 0x3FFE;
3675     rem1 = 0;
3676     if ( bSig <= aSig ) {
3677         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3678         ++zExp;
3679     }
3680     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3681     mul64To128( bSig, zSig0, &term0, &term1 );
3682     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3683     while ( (sbits64) rem0 < 0 ) {
3684         --zSig0;
3685         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3686     }
3687     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3688     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3689         mul64To128( bSig, zSig1, &term1, &term2 );
3690         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3691         while ( (sbits64) rem1 < 0 ) {
3692             --zSig1;
3693             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3694         }
3695         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3696     }
3697     return
3698         roundAndPackFloatx80(
3699             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3700
3701 }
3702
3703 /*----------------------------------------------------------------------------
3704 | Returns the remainder of the extended double-precision floating-point value
3705 | `a' with respect to the corresponding value `b'.  The operation is performed
3706 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3707 *----------------------------------------------------------------------------*/
3708
3709 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3710 {
3711     flag aSign, bSign, zSign;
3712     int32 aExp, bExp, expDiff;
3713     bits64 aSig0, aSig1, bSig;
3714     bits64 q, term0, term1, alternateASig0, alternateASig1;
3715     floatx80 z;
3716
3717     aSig0 = extractFloatx80Frac( a );
3718     aExp = extractFloatx80Exp( a );
3719     aSign = extractFloatx80Sign( a );
3720     bSig = extractFloatx80Frac( b );
3721     bExp = extractFloatx80Exp( b );
3722     bSign = extractFloatx80Sign( b );
3723     if ( aExp == 0x7FFF ) {
3724         if (    (bits64) ( aSig0<<1 )
3725              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3726             return propagateFloatx80NaN( a, b STATUS_VAR );
3727         }
3728         goto invalid;
3729     }
3730     if ( bExp == 0x7FFF ) {
3731         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3732         return a;
3733     }
3734     if ( bExp == 0 ) {
3735         if ( bSig == 0 ) {
3736  invalid:
3737             float_raise( float_flag_invalid STATUS_VAR);
3738             z.low = floatx80_default_nan_low;
3739             z.high = floatx80_default_nan_high;
3740             return z;
3741         }
3742         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3743     }
3744     if ( aExp == 0 ) {
3745         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3746         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3747     }
3748     bSig |= LIT64( 0x8000000000000000 );
3749     zSign = aSign;
3750     expDiff = aExp - bExp;
3751     aSig1 = 0;
3752     if ( expDiff < 0 ) {
3753         if ( expDiff < -1 ) return a;
3754         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3755         expDiff = 0;
3756     }
3757     q = ( bSig <= aSig0 );
3758     if ( q ) aSig0 -= bSig;
3759     expDiff -= 64;
3760     while ( 0 < expDiff ) {
3761         q = estimateDiv128To64( aSig0, aSig1, bSig );
3762         q = ( 2 < q ) ? q - 2 : 0;
3763         mul64To128( bSig, q, &term0, &term1 );
3764         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3765         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3766         expDiff -= 62;
3767     }
3768     expDiff += 64;
3769     if ( 0 < expDiff ) {
3770         q = estimateDiv128To64( aSig0, aSig1, bSig );
3771         q = ( 2 < q ) ? q - 2 : 0;
3772         q >>= 64 - expDiff;
3773         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3774         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3775         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3776         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3777             ++q;
3778             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3779         }
3780     }
3781     else {
3782         term1 = 0;
3783         term0 = bSig;
3784     }
3785     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3786     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3787          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3788               && ( q & 1 ) )
3789        ) {
3790         aSig0 = alternateASig0;
3791         aSig1 = alternateASig1;
3792         zSign = ! zSign;
3793     }
3794     return
3795         normalizeRoundAndPackFloatx80(
3796             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3797
3798 }
3799
3800 /*----------------------------------------------------------------------------
3801 | Returns the square root of the extended double-precision floating-point
3802 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3803 | for Binary Floating-Point Arithmetic.
3804 *----------------------------------------------------------------------------*/
3805
3806 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3807 {
3808     flag aSign;
3809     int32 aExp, zExp;
3810     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3811     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3812     floatx80 z;
3813
3814     aSig0 = extractFloatx80Frac( a );
3815     aExp = extractFloatx80Exp( a );
3816     aSign = extractFloatx80Sign( a );
3817     if ( aExp == 0x7FFF ) {
3818         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3819         if ( ! aSign ) return a;
3820         goto invalid;
3821     }
3822     if ( aSign ) {
3823         if ( ( aExp | aSig0 ) == 0 ) return a;
3824  invalid:
3825         float_raise( float_flag_invalid STATUS_VAR);
3826         z.low = floatx80_default_nan_low;
3827         z.high = floatx80_default_nan_high;
3828         return z;
3829     }
3830     if ( aExp == 0 ) {
3831         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3832         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3833     }
3834     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3835     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3836     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3837     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3838     doubleZSig0 = zSig0<<1;
3839     mul64To128( zSig0, zSig0, &term0, &term1 );
3840     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3841     while ( (sbits64) rem0 < 0 ) {
3842         --zSig0;
3843         doubleZSig0 -= 2;
3844         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3845     }
3846     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3847     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3848         if ( zSig1 == 0 ) zSig1 = 1;
3849         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3850         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3851         mul64To128( zSig1, zSig1, &term2, &term3 );
3852         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3853         while ( (sbits64) rem1 < 0 ) {
3854             --zSig1;
3855             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3856             term3 |= 1;
3857             term2 |= doubleZSig0;
3858             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3859         }
3860         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3861     }
3862     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3863     zSig0 |= doubleZSig0;
3864     return
3865         roundAndPackFloatx80(
3866             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3867
3868 }
3869
3870 /*----------------------------------------------------------------------------
3871 | Returns 1 if the extended double-precision floating-point value `a' is
3872 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3873 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3874 | Arithmetic.
3875 *----------------------------------------------------------------------------*/
3876
3877 flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3878 {
3879
3880     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3881               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3882          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3883               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3884        ) {
3885         if (    floatx80_is_signaling_nan( a )
3886              || floatx80_is_signaling_nan( b ) ) {
3887             float_raise( float_flag_invalid STATUS_VAR);
3888         }
3889         return 0;
3890     }
3891     return
3892            ( a.low == b.low )
3893         && (    ( a.high == b.high )
3894              || (    ( a.low == 0 )
3895                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3896            );
3897
3898 }
3899
3900 /*----------------------------------------------------------------------------
3901 | Returns 1 if the extended double-precision floating-point value `a' is
3902 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3903 | comparison is performed according to the IEC/IEEE Standard for Binary
3904 | Floating-Point Arithmetic.
3905 *----------------------------------------------------------------------------*/
3906
3907 flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3908 {
3909     flag aSign, bSign;
3910
3911     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3912               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3913          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3914               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3915        ) {
3916         float_raise( float_flag_invalid STATUS_VAR);
3917         return 0;
3918     }
3919     aSign = extractFloatx80Sign( a );
3920     bSign = extractFloatx80Sign( b );
3921     if ( aSign != bSign ) {
3922         return
3923                aSign
3924             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3925                  == 0 );
3926     }
3927     return
3928           aSign ? le128( b.high, b.low, a.high, a.low )
3929         : le128( a.high, a.low, b.high, b.low );
3930
3931 }
3932
3933 /*----------------------------------------------------------------------------
3934 | Returns 1 if the extended double-precision floating-point value `a' is
3935 | less than the corresponding value `b', and 0 otherwise.  The comparison
3936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3937 | Arithmetic.
3938 *----------------------------------------------------------------------------*/
3939
3940 flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3941 {
3942     flag aSign, bSign;
3943
3944     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3945               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3946          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3947               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3948        ) {
3949         float_raise( float_flag_invalid STATUS_VAR);
3950         return 0;
3951     }
3952     aSign = extractFloatx80Sign( a );
3953     bSign = extractFloatx80Sign( b );
3954     if ( aSign != bSign ) {
3955         return
3956                aSign
3957             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3958                  != 0 );
3959     }
3960     return
3961           aSign ? lt128( b.high, b.low, a.high, a.low )
3962         : lt128( a.high, a.low, b.high, b.low );
3963
3964 }
3965
3966 /*----------------------------------------------------------------------------
3967 | Returns 1 if the extended double-precision floating-point value `a' is equal
3968 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3969 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3970 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3972
3973 flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3974 {
3975
3976     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3977               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3978          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3979               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3980        ) {
3981         float_raise( float_flag_invalid STATUS_VAR);
3982         return 0;
3983     }
3984     return
3985            ( a.low == b.low )
3986         && (    ( a.high == b.high )
3987              || (    ( a.low == 0 )
3988                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3989            );
3990
3991 }
3992
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the extended double-precision floating-point value `a' is less
3995 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3996 | do not cause an exception.  Otherwise, the comparison is performed according
3997 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999
4000 flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4001 {
4002     flag aSign, bSign;
4003
4004     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4005               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4006          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4007               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4008        ) {
4009         if (    floatx80_is_signaling_nan( a )
4010              || floatx80_is_signaling_nan( b ) ) {
4011             float_raise( float_flag_invalid STATUS_VAR);
4012         }
4013         return 0;
4014     }
4015     aSign = extractFloatx80Sign( a );
4016     bSign = extractFloatx80Sign( b );
4017     if ( aSign != bSign ) {
4018         return
4019                aSign
4020             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4021                  == 0 );
4022     }
4023     return
4024           aSign ? le128( b.high, b.low, a.high, a.low )
4025         : le128( a.high, a.low, b.high, b.low );
4026
4027 }
4028
4029 /*----------------------------------------------------------------------------
4030 | Returns 1 if the extended double-precision floating-point value `a' is less
4031 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4032 | an exception.  Otherwise, the comparison is performed according to the
4033 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4034 *----------------------------------------------------------------------------*/
4035
4036 flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4037 {
4038     flag aSign, bSign;
4039
4040     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4041               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4042          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4043               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4044        ) {
4045         if (    floatx80_is_signaling_nan( a )
4046              || floatx80_is_signaling_nan( b ) ) {
4047             float_raise( float_flag_invalid STATUS_VAR);
4048         }
4049         return 0;
4050     }
4051     aSign = extractFloatx80Sign( a );
4052     bSign = extractFloatx80Sign( b );
4053     if ( aSign != bSign ) {
4054         return
4055                aSign
4056             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4057                  != 0 );
4058     }
4059     return
4060           aSign ? lt128( b.high, b.low, a.high, a.low )
4061         : lt128( a.high, a.low, b.high, b.low );
4062
4063 }
4064
4065 #endif
4066
4067 #ifdef FLOAT128
4068
4069 /*----------------------------------------------------------------------------
4070 | Returns the result of converting the quadruple-precision floating-point
4071 | value `a' to the 32-bit two's complement integer format.  The conversion
4072 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4073 | Arithmetic---which means in particular that the conversion is rounded
4074 | according to the current rounding mode.  If `a' is a NaN, the largest
4075 | positive integer is returned.  Otherwise, if the conversion overflows, the
4076 | largest integer with the same sign as `a' is returned.
4077 *----------------------------------------------------------------------------*/
4078
4079 int32 float128_to_int32( float128 a STATUS_PARAM )
4080 {
4081     flag aSign;
4082     int32 aExp, shiftCount;
4083     bits64 aSig0, aSig1;
4084
4085     aSig1 = extractFloat128Frac1( a );
4086     aSig0 = extractFloat128Frac0( a );
4087     aExp = extractFloat128Exp( a );
4088     aSign = extractFloat128Sign( a );
4089     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4090     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4091     aSig0 |= ( aSig1 != 0 );
4092     shiftCount = 0x4028 - aExp;
4093     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4094     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4095
4096 }
4097
4098 /*----------------------------------------------------------------------------
4099 | Returns the result of converting the quadruple-precision floating-point
4100 | value `a' to the 32-bit two's complement integer format.  The conversion
4101 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4102 | Arithmetic, except that the conversion is always rounded toward zero.  If
4103 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4104 | conversion overflows, the largest integer with the same sign as `a' is
4105 | returned.
4106 *----------------------------------------------------------------------------*/
4107
4108 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4109 {
4110     flag aSign;
4111     int32 aExp, shiftCount;
4112     bits64 aSig0, aSig1, savedASig;
4113     int32 z;
4114
4115     aSig1 = extractFloat128Frac1( a );
4116     aSig0 = extractFloat128Frac0( a );
4117     aExp = extractFloat128Exp( a );
4118     aSign = extractFloat128Sign( a );
4119     aSig0 |= ( aSig1 != 0 );
4120     if ( 0x401E < aExp ) {
4121         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4122         goto invalid;
4123     }
4124     else if ( aExp < 0x3FFF ) {
4125         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4126         return 0;
4127     }
4128     aSig0 |= LIT64( 0x0001000000000000 );
4129     shiftCount = 0x402F - aExp;
4130     savedASig = aSig0;
4131     aSig0 >>= shiftCount;
4132     z = aSig0;
4133     if ( aSign ) z = - z;
4134     if ( ( z < 0 ) ^ aSign ) {
4135  invalid:
4136         float_raise( float_flag_invalid STATUS_VAR);
4137         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4138     }
4139     if ( ( aSig0<<shiftCount ) != savedASig ) {
4140         STATUS(float_exception_flags) |= float_flag_inexact;
4141     }
4142     return z;
4143
4144 }
4145
4146 /*----------------------------------------------------------------------------
4147 | Returns the result of converting the quadruple-precision floating-point
4148 | value `a' to the 64-bit two's complement integer format.  The conversion
4149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4150 | Arithmetic---which means in particular that the conversion is rounded
4151 | according to the current rounding mode.  If `a' is a NaN, the largest
4152 | positive integer is returned.  Otherwise, if the conversion overflows, the
4153 | largest integer with the same sign as `a' is returned.
4154 *----------------------------------------------------------------------------*/
4155
4156 int64 float128_to_int64( float128 a STATUS_PARAM )
4157 {
4158     flag aSign;
4159     int32 aExp, shiftCount;
4160     bits64 aSig0, aSig1;
4161
4162     aSig1 = extractFloat128Frac1( a );
4163     aSig0 = extractFloat128Frac0( a );
4164     aExp = extractFloat128Exp( a );
4165     aSign = extractFloat128Sign( a );
4166     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4167     shiftCount = 0x402F - aExp;
4168     if ( shiftCount <= 0 ) {
4169         if ( 0x403E < aExp ) {
4170             float_raise( float_flag_invalid STATUS_VAR);
4171             if (    ! aSign
4172                  || (    ( aExp == 0x7FFF )
4173                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4174                     )
4175                ) {
4176                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4177             }
4178             return (sbits64) LIT64( 0x8000000000000000 );
4179         }
4180         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4181     }
4182     else {
4183         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4184     }
4185     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4186
4187 }
4188
4189 /*----------------------------------------------------------------------------
4190 | Returns the result of converting the quadruple-precision floating-point
4191 | value `a' to the 64-bit two's complement integer format.  The conversion
4192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4193 | Arithmetic, except that the conversion is always rounded toward zero.
4194 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4195 | the conversion overflows, the largest integer with the same sign as `a' is
4196 | returned.
4197 *----------------------------------------------------------------------------*/
4198
4199 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4200 {
4201     flag aSign;
4202     int32 aExp, shiftCount;
4203     bits64 aSig0, aSig1;
4204     int64 z;
4205
4206     aSig1 = extractFloat128Frac1( a );
4207     aSig0 = extractFloat128Frac0( a );
4208     aExp = extractFloat128Exp( a );
4209     aSign = extractFloat128Sign( a );
4210     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4211     shiftCount = aExp - 0x402F;
4212     if ( 0 < shiftCount ) {
4213         if ( 0x403E <= aExp ) {
4214             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4215             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4216                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4217                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4218             }
4219             else {
4220                 float_raise( float_flag_invalid STATUS_VAR);
4221                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4222                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4223                 }
4224             }
4225             return (sbits64) LIT64( 0x8000000000000000 );
4226         }
4227         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4228         if ( (bits64) ( aSig1<<shiftCount ) ) {
4229             STATUS(float_exception_flags) |= float_flag_inexact;
4230         }
4231     }
4232     else {
4233         if ( aExp < 0x3FFF ) {
4234             if ( aExp | aSig0 | aSig1 ) {
4235                 STATUS(float_exception_flags) |= float_flag_inexact;
4236             }
4237             return 0;
4238         }
4239         z = aSig0>>( - shiftCount );
4240         if (    aSig1
4241              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4242             STATUS(float_exception_flags) |= float_flag_inexact;
4243         }
4244     }
4245     if ( aSign ) z = - z;
4246     return z;
4247
4248 }
4249
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of converting the quadruple-precision floating-point
4252 | value `a' to the single-precision floating-point format.  The conversion
4253 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4254 | Arithmetic.
4255 *----------------------------------------------------------------------------*/
4256
4257 float32 float128_to_float32( float128 a STATUS_PARAM )
4258 {
4259     flag aSign;
4260     int32 aExp;
4261     bits64 aSig0, aSig1;
4262     bits32 zSig;
4263
4264     aSig1 = extractFloat128Frac1( a );
4265     aSig0 = extractFloat128Frac0( a );
4266     aExp = extractFloat128Exp( a );
4267     aSign = extractFloat128Sign( a );
4268     if ( aExp == 0x7FFF ) {
4269         if ( aSig0 | aSig1 ) {
4270             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4271         }
4272         return packFloat32( aSign, 0xFF, 0 );
4273     }
4274     aSig0 |= ( aSig1 != 0 );
4275     shift64RightJamming( aSig0, 18, &aSig0 );
4276     zSig = aSig0;
4277     if ( aExp || zSig ) {
4278         zSig |= 0x40000000;
4279         aExp -= 0x3F81;
4280     }
4281     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4282
4283 }
4284
4285 /*----------------------------------------------------------------------------
4286 | Returns the result of converting the quadruple-precision floating-point
4287 | value `a' to the double-precision floating-point format.  The conversion
4288 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4289 | Arithmetic.
4290 *----------------------------------------------------------------------------*/
4291
4292 float64 float128_to_float64( float128 a STATUS_PARAM )
4293 {
4294     flag aSign;
4295     int32 aExp;
4296     bits64 aSig0, aSig1;
4297
4298     aSig1 = extractFloat128Frac1( a );
4299     aSig0 = extractFloat128Frac0( a );
4300     aExp = extractFloat128Exp( a );
4301     aSign = extractFloat128Sign( a );
4302     if ( aExp == 0x7FFF ) {
4303         if ( aSig0 | aSig1 ) {
4304             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4305         }
4306         return packFloat64( aSign, 0x7FF, 0 );
4307     }
4308     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4309     aSig0 |= ( aSig1 != 0 );
4310     if ( aExp || aSig0 ) {
4311         aSig0 |= LIT64( 0x4000000000000000 );
4312         aExp -= 0x3C01;
4313     }
4314     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4315
4316 }
4317
4318 #ifdef FLOATX80
4319
4320 /*----------------------------------------------------------------------------
4321 | Returns the result of converting the quadruple-precision floating-point
4322 | value `a' to the extended double-precision floating-point format.  The
4323 | conversion is performed according to the IEC/IEEE Standard for Binary
4324 | Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4326
4327 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4328 {
4329     flag aSign;
4330     int32 aExp;
4331     bits64 aSig0, aSig1;
4332
4333     aSig1 = extractFloat128Frac1( a );
4334     aSig0 = extractFloat128Frac0( a );
4335     aExp = extractFloat128Exp( a );
4336     aSign = extractFloat128Sign( a );
4337     if ( aExp == 0x7FFF ) {
4338         if ( aSig0 | aSig1 ) {
4339             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4340         }
4341         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4342     }
4343     if ( aExp == 0 ) {
4344         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4345         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4346     }
4347     else {
4348         aSig0 |= LIT64( 0x0001000000000000 );
4349     }
4350     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4351     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4352
4353 }
4354
4355 #endif
4356
4357 /*----------------------------------------------------------------------------
4358 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4359 | returns the result as a quadruple-precision floating-point value.  The
4360 | operation is performed according to the IEC/IEEE Standard for Binary
4361 | Floating-Point Arithmetic.
4362 *----------------------------------------------------------------------------*/
4363
4364 float128 float128_round_to_int( float128 a STATUS_PARAM )
4365 {
4366     flag aSign;
4367     int32 aExp;
4368     bits64 lastBitMask, roundBitsMask;
4369     int8 roundingMode;
4370     float128 z;
4371
4372     aExp = extractFloat128Exp( a );
4373     if ( 0x402F <= aExp ) {
4374         if ( 0x406F <= aExp ) {
4375             if (    ( aExp == 0x7FFF )
4376                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4377                ) {
4378                 return propagateFloat128NaN( a, a STATUS_VAR );
4379             }
4380             return a;
4381         }
4382         lastBitMask = 1;
4383         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4384         roundBitsMask = lastBitMask - 1;
4385         z = a;
4386         roundingMode = STATUS(float_rounding_mode);
4387         if ( roundingMode == float_round_nearest_even ) {
4388             if ( lastBitMask ) {
4389                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4390                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4391             }
4392             else {
4393                 if ( (sbits64) z.low < 0 ) {
4394                     ++z.high;
4395                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4396                 }
4397             }
4398         }
4399         else if ( roundingMode != float_round_to_zero ) {
4400             if (   extractFloat128Sign( z )
4401                  ^ ( roundingMode == float_round_up ) ) {
4402                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4403             }
4404         }
4405         z.low &= ~ roundBitsMask;
4406     }
4407     else {
4408         if ( aExp < 0x3FFF ) {
4409             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4410             STATUS(float_exception_flags) |= float_flag_inexact;
4411             aSign = extractFloat128Sign( a );
4412             switch ( STATUS(float_rounding_mode) ) {
4413              case float_round_nearest_even:
4414                 if (    ( aExp == 0x3FFE )
4415                      && (   extractFloat128Frac0( a )
4416                           | extractFloat128Frac1( a ) )
4417                    ) {
4418                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4419                 }
4420                 break;
4421              case float_round_down:
4422                 return
4423                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4424                     : packFloat128( 0, 0, 0, 0 );
4425              case float_round_up:
4426                 return
4427                       aSign ? packFloat128( 1, 0, 0, 0 )
4428                     : packFloat128( 0, 0x3FFF, 0, 0 );
4429             }
4430             return packFloat128( aSign, 0, 0, 0 );
4431         }
4432         lastBitMask = 1;
4433         lastBitMask <<= 0x402F - aExp;
4434         roundBitsMask = lastBitMask - 1;
4435         z.low = 0;
4436         z.high = a.high;
4437         roundingMode = STATUS(float_rounding_mode);
4438         if ( roundingMode == float_round_nearest_even ) {
4439             z.high += lastBitMask>>1;
4440             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4441                 z.high &= ~ lastBitMask;
4442             }
4443         }
4444         else if ( roundingMode != float_round_to_zero ) {
4445             if (   extractFloat128Sign( z )
4446                  ^ ( roundingMode == float_round_up ) ) {
4447                 z.high |= ( a.low != 0 );
4448                 z.high += roundBitsMask;
4449             }
4450         }
4451         z.high &= ~ roundBitsMask;
4452     }
4453     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4454         STATUS(float_exception_flags) |= float_flag_inexact;
4455     }
4456     return z;
4457
4458 }
4459
4460 /*----------------------------------------------------------------------------
4461 | Returns the result of adding the absolute values of the quadruple-precision
4462 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4463 | before being returned.  `zSign' is ignored if the result is a NaN.
4464 | The addition is performed according to the IEC/IEEE Standard for Binary
4465 | Floating-Point Arithmetic.
4466 *----------------------------------------------------------------------------*/
4467
4468 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4469 {
4470     int32 aExp, bExp, zExp;
4471     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4472     int32 expDiff;
4473
4474     aSig1 = extractFloat128Frac1( a );
4475     aSig0 = extractFloat128Frac0( a );
4476     aExp = extractFloat128Exp( a );
4477     bSig1 = extractFloat128Frac1( b );
4478     bSig0 = extractFloat128Frac0( b );
4479     bExp = extractFloat128Exp( b );
4480     expDiff = aExp - bExp;
4481     if ( 0 < expDiff ) {
4482         if ( aExp == 0x7FFF ) {
4483             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4484             return a;
4485         }
4486         if ( bExp == 0 ) {
4487             --expDiff;
4488         }
4489         else {
4490             bSig0 |= LIT64( 0x0001000000000000 );
4491         }
4492         shift128ExtraRightJamming(
4493             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4494         zExp = aExp;
4495     }
4496     else if ( expDiff < 0 ) {
4497         if ( bExp == 0x7FFF ) {
4498             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4499             return packFloat128( zSign, 0x7FFF, 0, 0 );
4500         }
4501         if ( aExp == 0 ) {
4502             ++expDiff;
4503         }
4504         else {
4505             aSig0 |= LIT64( 0x0001000000000000 );
4506         }
4507         shift128ExtraRightJamming(
4508             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4509         zExp = bExp;
4510     }
4511     else {
4512         if ( aExp == 0x7FFF ) {
4513             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4514                 return propagateFloat128NaN( a, b STATUS_VAR );
4515             }
4516             return a;
4517         }
4518         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4519         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4520         zSig2 = 0;
4521         zSig0 |= LIT64( 0x0002000000000000 );
4522         zExp = aExp;
4523         goto shiftRight1;
4524     }
4525     aSig0 |= LIT64( 0x0001000000000000 );
4526     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4527     --zExp;
4528     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4529     ++zExp;
4530  shiftRight1:
4531     shift128ExtraRightJamming(
4532         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4533  roundAndPack:
4534     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4535
4536 }
4537
4538 /*----------------------------------------------------------------------------
4539 | Returns the result of subtracting the absolute values of the quadruple-
4540 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4541 | difference is negated before being returned.  `zSign' is ignored if the
4542 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4543 | Standard for Binary Floating-Point Arithmetic.
4544 *----------------------------------------------------------------------------*/
4545
4546 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4547 {
4548     int32 aExp, bExp, zExp;
4549     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4550     int32 expDiff;
4551     float128 z;
4552
4553     aSig1 = extractFloat128Frac1( a );
4554     aSig0 = extractFloat128Frac0( a );
4555     aExp = extractFloat128Exp( a );
4556     bSig1 = extractFloat128Frac1( b );
4557     bSig0 = extractFloat128Frac0( b );
4558     bExp = extractFloat128Exp( b );
4559     expDiff = aExp - bExp;
4560     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4561     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4562     if ( 0 < expDiff ) goto aExpBigger;
4563     if ( expDiff < 0 ) goto bExpBigger;
4564     if ( aExp == 0x7FFF ) {
4565         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4566             return propagateFloat128NaN( a, b STATUS_VAR );
4567         }
4568         float_raise( float_flag_invalid STATUS_VAR);
4569         z.low = float128_default_nan_low;
4570         z.high = float128_default_nan_high;
4571         return z;
4572     }
4573     if ( aExp == 0 ) {
4574         aExp = 1;
4575         bExp = 1;
4576     }
4577     if ( bSig0 < aSig0 ) goto aBigger;
4578     if ( aSig0 < bSig0 ) goto bBigger;
4579     if ( bSig1 < aSig1 ) goto aBigger;
4580     if ( aSig1 < bSig1 ) goto bBigger;
4581     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4582  bExpBigger:
4583     if ( bExp == 0x7FFF ) {
4584         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4585         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4586     }
4587     if ( aExp == 0 ) {
4588         ++expDiff;
4589     }
4590     else {
4591         aSig0 |= LIT64( 0x4000000000000000 );
4592     }
4593     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4594     bSig0 |= LIT64( 0x4000000000000000 );
4595  bBigger:
4596     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4597     zExp = bExp;
4598     zSign ^= 1;
4599     goto normalizeRoundAndPack;
4600  aExpBigger:
4601     if ( aExp == 0x7FFF ) {
4602         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4603         return a;
4604     }
4605     if ( bExp == 0 ) {
4606         --expDiff;
4607     }
4608     else {
4609         bSig0 |= LIT64( 0x4000000000000000 );
4610     }
4611     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4612     aSig0 |= LIT64( 0x4000000000000000 );
4613  aBigger:
4614     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4615     zExp = aExp;
4616  normalizeRoundAndPack:
4617     --zExp;
4618     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4619
4620 }
4621
4622 /*----------------------------------------------------------------------------
4623 | Returns the result of adding the quadruple-precision floating-point values
4624 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4625 | for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4627
4628 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4629 {
4630     flag aSign, bSign;
4631
4632     aSign = extractFloat128Sign( a );
4633     bSign = extractFloat128Sign( b );
4634     if ( aSign == bSign ) {
4635         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4636     }
4637     else {
4638         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4639     }
4640
4641 }
4642
4643 /*----------------------------------------------------------------------------
4644 | Returns the result of subtracting the quadruple-precision floating-point
4645 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4646 | Standard for Binary Floating-Point Arithmetic.
4647 *----------------------------------------------------------------------------*/
4648
4649 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4650 {
4651     flag aSign, bSign;
4652
4653     aSign = extractFloat128Sign( a );
4654     bSign = extractFloat128Sign( b );
4655     if ( aSign == bSign ) {
4656         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4657     }
4658     else {
4659         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4660     }
4661
4662 }
4663
4664 /*----------------------------------------------------------------------------
4665 | Returns the result of multiplying the quadruple-precision floating-point
4666 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4667 | Standard for Binary Floating-Point Arithmetic.
4668 *----------------------------------------------------------------------------*/
4669
4670 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4671 {
4672     flag aSign, bSign, zSign;
4673     int32 aExp, bExp, zExp;
4674     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4675     float128 z;
4676
4677     aSig1 = extractFloat128Frac1( a );
4678     aSig0 = extractFloat128Frac0( a );
4679     aExp = extractFloat128Exp( a );
4680     aSign = extractFloat128Sign( a );
4681     bSig1 = extractFloat128Frac1( b );
4682     bSig0 = extractFloat128Frac0( b );
4683     bExp = extractFloat128Exp( b );
4684     bSign = extractFloat128Sign( b );
4685     zSign = aSign ^ bSign;
4686     if ( aExp == 0x7FFF ) {
4687         if (    ( aSig0 | aSig1 )
4688              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4689             return propagateFloat128NaN( a, b STATUS_VAR );
4690         }
4691         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4692         return packFloat128( zSign, 0x7FFF, 0, 0 );
4693     }
4694     if ( bExp == 0x7FFF ) {
4695         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4696         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4697  invalid:
4698             float_raise( float_flag_invalid STATUS_VAR);
4699             z.low = float128_default_nan_low;
4700             z.high = float128_default_nan_high;
4701             return z;
4702         }
4703         return packFloat128( zSign, 0x7FFF, 0, 0 );
4704     }
4705     if ( aExp == 0 ) {
4706         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4707         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4708     }
4709     if ( bExp == 0 ) {
4710         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4711         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4712     }
4713     zExp = aExp + bExp - 0x4000;
4714     aSig0 |= LIT64( 0x0001000000000000 );
4715     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4716     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4717     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4718     zSig2 |= ( zSig3 != 0 );
4719     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4720         shift128ExtraRightJamming(
4721             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4722         ++zExp;
4723     }
4724     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4725
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of dividing the quadruple-precision floating-point value
4730 | `a' by the corresponding value `b'.  The operation is performed according to
4731 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4732 *----------------------------------------------------------------------------*/
4733
4734 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4735 {
4736     flag aSign, bSign, zSign;
4737     int32 aExp, bExp, zExp;
4738     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4739     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4740     float128 z;
4741
4742     aSig1 = extractFloat128Frac1( a );
4743     aSig0 = extractFloat128Frac0( a );
4744     aExp = extractFloat128Exp( a );
4745     aSign = extractFloat128Sign( a );
4746     bSig1 = extractFloat128Frac1( b );
4747     bSig0 = extractFloat128Frac0( b );
4748     bExp = extractFloat128Exp( b );
4749     bSign = extractFloat128Sign( b );
4750     zSign = aSign ^ bSign;
4751     if ( aExp == 0x7FFF ) {
4752         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4753         if ( bExp == 0x7FFF ) {
4754             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4755             goto invalid;
4756         }
4757         return packFloat128( zSign, 0x7FFF, 0, 0 );
4758     }
4759     if ( bExp == 0x7FFF ) {
4760         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4761         return packFloat128( zSign, 0, 0, 0 );
4762     }
4763     if ( bExp == 0 ) {
4764         if ( ( bSig0 | bSig1 ) == 0 ) {
4765             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4766  invalid:
4767                 float_raise( float_flag_invalid STATUS_VAR);
4768                 z.low = float128_default_nan_low;
4769                 z.high = float128_default_nan_high;
4770                 return z;
4771             }
4772             float_raise( float_flag_divbyzero STATUS_VAR);
4773             return packFloat128( zSign, 0x7FFF, 0, 0 );
4774         }
4775         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4776     }
4777     if ( aExp == 0 ) {
4778         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4779         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4780     }
4781     zExp = aExp - bExp + 0x3FFD;
4782     shortShift128Left(
4783         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4784     shortShift128Left(
4785         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4786     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4787         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4788         ++zExp;
4789     }
4790     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4791     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4792     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4793     while ( (sbits64) rem0 < 0 ) {
4794         --zSig0;
4795         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4796     }
4797     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4798     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4799         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4800         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4801         while ( (sbits64) rem1 < 0 ) {
4802             --zSig1;
4803             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4804         }
4805         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4806     }
4807     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4808     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4809
4810 }
4811
4812 /*----------------------------------------------------------------------------
4813 | Returns the remainder of the quadruple-precision floating-point value `a'
4814 | with respect to the corresponding value `b'.  The operation is performed
4815 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4817
4818 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4819 {
4820     flag aSign, bSign, zSign;
4821     int32 aExp, bExp, expDiff;
4822     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4823     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4824     sbits64 sigMean0;
4825     float128 z;
4826
4827     aSig1 = extractFloat128Frac1( a );
4828     aSig0 = extractFloat128Frac0( a );
4829     aExp = extractFloat128Exp( a );
4830     aSign = extractFloat128Sign( a );
4831     bSig1 = extractFloat128Frac1( b );
4832     bSig0 = extractFloat128Frac0( b );
4833     bExp = extractFloat128Exp( b );
4834     bSign = extractFloat128Sign( b );
4835     if ( aExp == 0x7FFF ) {
4836         if (    ( aSig0 | aSig1 )
4837              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4838             return propagateFloat128NaN( a, b STATUS_VAR );
4839         }
4840         goto invalid;
4841     }
4842     if ( bExp == 0x7FFF ) {
4843         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4844         return a;
4845     }
4846     if ( bExp == 0 ) {
4847         if ( ( bSig0 | bSig1 ) == 0 ) {
4848  invalid:
4849             float_raise( float_flag_invalid STATUS_VAR);
4850             z.low = float128_default_nan_low;
4851             z.high = float128_default_nan_high;
4852             return z;
4853         }
4854         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4855     }
4856     if ( aExp == 0 ) {
4857         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4858         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4859     }
4860     expDiff = aExp - bExp;
4861     if ( expDiff < -1 ) return a;
4862     shortShift128Left(
4863         aSig0 | LIT64( 0x0001000000000000 ),
4864         aSig1,
4865         15 - ( expDiff < 0 ),
4866         &aSig0,
4867         &aSig1
4868     );
4869     shortShift128Left(
4870         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4871     q = le128( bSig0, bSig1, aSig0, aSig1 );
4872     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4873     expDiff -= 64;
4874     while ( 0 < expDiff ) {
4875         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4876         q = ( 4 < q ) ? q - 4 : 0;
4877         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4878         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4879         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4880         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4881         expDiff -= 61;
4882     }
4883     if ( -64 < expDiff ) {
4884         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4885         q = ( 4 < q ) ? q - 4 : 0;
4886         q >>= - expDiff;
4887         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4888         expDiff += 52;
4889         if ( expDiff < 0 ) {
4890             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4891         }
4892         else {
4893             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4894         }
4895         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4896         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4897     }
4898     else {
4899         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4900         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4901     }
4902     do {
4903         alternateASig0 = aSig0;
4904         alternateASig1 = aSig1;
4905         ++q;
4906         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4907     } while ( 0 <= (sbits64) aSig0 );
4908     add128(
4909         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4910     if (    ( sigMean0 < 0 )
4911          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4912         aSig0 = alternateASig0;
4913         aSig1 = alternateASig1;
4914     }
4915     zSign = ( (sbits64) aSig0 < 0 );
4916     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4917     return
4918         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4919
4920 }
4921
4922 /*----------------------------------------------------------------------------
4923 | Returns the square root of the quadruple-precision floating-point value `a'.
4924 | The operation is performed according to the IEC/IEEE Standard for Binary
4925 | Floating-Point Arithmetic.
4926 *----------------------------------------------------------------------------*/
4927
4928 float128 float128_sqrt( float128 a STATUS_PARAM )
4929 {
4930     flag aSign;
4931     int32 aExp, zExp;
4932     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4933     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4934     float128 z;
4935
4936     aSig1 = extractFloat128Frac1( a );
4937     aSig0 = extractFloat128Frac0( a );
4938     aExp = extractFloat128Exp( a );
4939     aSign = extractFloat128Sign( a );
4940     if ( aExp == 0x7FFF ) {
4941         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4942         if ( ! aSign ) return a;
4943         goto invalid;
4944     }
4945     if ( aSign ) {
4946         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4947  invalid:
4948         float_raise( float_flag_invalid STATUS_VAR);
4949         z.low = float128_default_nan_low;
4950         z.high = float128_default_nan_high;
4951         return z;
4952     }
4953     if ( aExp == 0 ) {
4954         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4955         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4956     }
4957     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4958     aSig0 |= LIT64( 0x0001000000000000 );
4959     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4960     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4961     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4962     doubleZSig0 = zSig0<<1;
4963     mul64To128( zSig0, zSig0, &term0, &term1 );
4964     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4965     while ( (sbits64) rem0 < 0 ) {
4966         --zSig0;
4967         doubleZSig0 -= 2;
4968         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4969     }
4970     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4971     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4972         if ( zSig1 == 0 ) zSig1 = 1;
4973         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4974         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4975         mul64To128( zSig1, zSig1, &term2, &term3 );
4976         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4977         while ( (sbits64) rem1 < 0 ) {
4978             --zSig1;
4979             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4980             term3 |= 1;
4981             term2 |= doubleZSig0;
4982             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4983         }
4984         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4985     }
4986     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4987     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4988
4989 }
4990
4991 /*----------------------------------------------------------------------------
4992 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4993 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4994 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4996
4997 flag float128_eq( float128 a, float128 b STATUS_PARAM )
4998 {
4999
5000     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5001               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5002          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5003               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5004        ) {
5005         if (    float128_is_signaling_nan( a )
5006              || float128_is_signaling_nan( b ) ) {
5007             float_raise( float_flag_invalid STATUS_VAR);
5008         }
5009         return 0;
5010     }
5011     return
5012            ( a.low == b.low )
5013         && (    ( a.high == b.high )
5014              || (    ( a.low == 0 )
5015                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5016            );
5017
5018 }
5019
5020 /*----------------------------------------------------------------------------
5021 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5022 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5024 | Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026
5027 flag float128_le( float128 a, float128 b STATUS_PARAM )
5028 {
5029     flag aSign, bSign;
5030
5031     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5032               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5033          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5034               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5035        ) {
5036         float_raise( float_flag_invalid STATUS_VAR);
5037         return 0;
5038     }
5039     aSign = extractFloat128Sign( a );
5040     bSign = extractFloat128Sign( b );
5041     if ( aSign != bSign ) {
5042         return
5043                aSign
5044             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5045                  == 0 );
5046     }
5047     return
5048           aSign ? le128( b.high, b.low, a.high, a.low )
5049         : le128( a.high, a.low, b.high, b.low );
5050
5051 }
5052
5053 /*----------------------------------------------------------------------------
5054 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5055 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5056 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5057 *----------------------------------------------------------------------------*/
5058
5059 flag float128_lt( float128 a, float128 b STATUS_PARAM )
5060 {
5061     flag aSign, bSign;
5062
5063     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5064               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5065          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5066               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5067        ) {
5068         float_raise( float_flag_invalid STATUS_VAR);
5069         return 0;
5070     }
5071     aSign = extractFloat128Sign( a );
5072     bSign = extractFloat128Sign( b );
5073     if ( aSign != bSign ) {
5074         return
5075                aSign
5076             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5077                  != 0 );
5078     }
5079     return
5080           aSign ? lt128( b.high, b.low, a.high, a.low )
5081         : lt128( a.high, a.low, b.high, b.low );
5082
5083 }
5084
5085 /*----------------------------------------------------------------------------
5086 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5087 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5088 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5091
5092 flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5093 {
5094
5095     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5096               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5097          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5098               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5099        ) {
5100         float_raise( float_flag_invalid STATUS_VAR);
5101         return 0;
5102     }
5103     return
5104            ( a.low == b.low )
5105         && (    ( a.high == b.high )
5106              || (    ( a.low == 0 )
5107                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5108            );
5109
5110 }
5111
5112 /*----------------------------------------------------------------------------
5113 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5114 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5115 | cause an exception.  Otherwise, the comparison is performed according to the
5116 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 *----------------------------------------------------------------------------*/
5118
5119 flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5120 {
5121     flag aSign, bSign;
5122
5123     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5124               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5125          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5126               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5127        ) {
5128         if (    float128_is_signaling_nan( a )
5129              || float128_is_signaling_nan( b ) ) {
5130             float_raise( float_flag_invalid STATUS_VAR);
5131         }
5132         return 0;
5133     }
5134     aSign = extractFloat128Sign( a );
5135     bSign = extractFloat128Sign( b );
5136     if ( aSign != bSign ) {
5137         return
5138                aSign
5139             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5140                  == 0 );
5141     }
5142     return
5143           aSign ? le128( b.high, b.low, a.high, a.low )
5144         : le128( a.high, a.low, b.high, b.low );
5145
5146 }
5147
5148 /*----------------------------------------------------------------------------
5149 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5150 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5151 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5152 | Standard for Binary Floating-Point Arithmetic.
5153 *----------------------------------------------------------------------------*/
5154
5155 flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5156 {
5157     flag aSign, bSign;
5158
5159     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5160               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5161          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5162               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5163        ) {
5164         if (    float128_is_signaling_nan( a )
5165              || float128_is_signaling_nan( b ) ) {
5166             float_raise( float_flag_invalid STATUS_VAR);
5167         }
5168         return 0;
5169     }
5170     aSign = extractFloat128Sign( a );
5171     bSign = extractFloat128Sign( b );
5172     if ( aSign != bSign ) {
5173         return
5174                aSign
5175             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5176                  != 0 );
5177     }
5178     return
5179           aSign ? lt128( b.high, b.low, a.high, a.low )
5180         : lt128( a.high, a.low, b.high, b.low );
5181
5182 }
5183
5184 #endif
5185