2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
58 void set_floatx80_rounding_precision(int val STATUS_PARAM)
60 STATUS(floatx80_rounding_precision) = val;
64 /*----------------------------------------------------------------------------
65 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
66 | and 7, and returns the properly rounded 32-bit integer corresponding to the
67 | input. If `zSign' is 1, the input is negated before being converted to an
68 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
69 | is simply rounded to an integer, with the inexact exception raised if the
70 | input cannot be represented exactly as an integer. However, if the fixed-
71 | point input is too large, the invalid exception is raised and the largest
72 | positive or negative integer is returned.
73 *----------------------------------------------------------------------------*/
75 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
78 flag roundNearestEven;
79 int8 roundIncrement, roundBits;
82 roundingMode = STATUS(float_rounding_mode);
83 roundNearestEven = ( roundingMode == float_round_nearest_even );
84 roundIncrement = 0x40;
85 if ( ! roundNearestEven ) {
86 if ( roundingMode == float_round_to_zero ) {
90 roundIncrement = 0x7F;
92 if ( roundingMode == float_round_up ) roundIncrement = 0;
95 if ( roundingMode == float_round_down ) roundIncrement = 0;
99 roundBits = absZ & 0x7F;
100 absZ = ( absZ + roundIncrement )>>7;
101 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
103 if ( zSign ) z = - z;
104 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
105 float_raise( float_flag_invalid STATUS_VAR);
106 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
108 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
113 /*----------------------------------------------------------------------------
114 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
115 | `absZ1', with binary point between bits 63 and 64 (between the input words),
116 | and returns the properly rounded 64-bit integer corresponding to the input.
117 | If `zSign' is 1, the input is negated before being converted to an integer.
118 | Ordinarily, the fixed-point input is simply rounded to an integer, with
119 | the inexact exception raised if the input cannot be represented exactly as
120 | an integer. However, if the fixed-point input is too large, the invalid
121 | exception is raised and the largest positive or negative integer is
123 *----------------------------------------------------------------------------*/
125 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
128 flag roundNearestEven, increment;
131 roundingMode = STATUS(float_rounding_mode);
132 roundNearestEven = ( roundingMode == float_round_nearest_even );
133 increment = ( (sbits64) absZ1 < 0 );
134 if ( ! roundNearestEven ) {
135 if ( roundingMode == float_round_to_zero ) {
140 increment = ( roundingMode == float_round_down ) && absZ1;
143 increment = ( roundingMode == float_round_up ) && absZ1;
149 if ( absZ0 == 0 ) goto overflow;
150 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
153 if ( zSign ) z = - z;
154 if ( z && ( ( z < 0 ) ^ zSign ) ) {
156 float_raise( float_flag_invalid STATUS_VAR);
158 zSign ? (sbits64) LIT64( 0x8000000000000000 )
159 : LIT64( 0x7FFFFFFFFFFFFFFF );
161 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
166 /*----------------------------------------------------------------------------
167 | Returns the fraction bits of the single-precision floating-point value `a'.
168 *----------------------------------------------------------------------------*/
170 INLINE bits32 extractFloat32Frac( float32 a )
173 return a & 0x007FFFFF;
177 /*----------------------------------------------------------------------------
178 | Returns the exponent bits of the single-precision floating-point value `a'.
179 *----------------------------------------------------------------------------*/
181 INLINE int16 extractFloat32Exp( float32 a )
184 return ( a>>23 ) & 0xFF;
188 /*----------------------------------------------------------------------------
189 | Returns the sign bit of the single-precision floating-point value `a'.
190 *----------------------------------------------------------------------------*/
192 INLINE flag extractFloat32Sign( float32 a )
199 /*----------------------------------------------------------------------------
200 | Normalizes the subnormal single-precision floating-point value represented
201 | by the denormalized significand `aSig'. The normalized exponent and
202 | significand are stored at the locations pointed to by `zExpPtr' and
203 | `zSigPtr', respectively.
204 *----------------------------------------------------------------------------*/
207 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
211 shiftCount = countLeadingZeros32( aSig ) - 8;
212 *zSigPtr = aSig<<shiftCount;
213 *zExpPtr = 1 - shiftCount;
217 /*----------------------------------------------------------------------------
218 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
219 | single-precision floating-point value, returning the result. After being
220 | shifted into the proper positions, the three fields are simply added
221 | together to form the result. This means that any integer portion of `zSig'
222 | will be added into the exponent. Since a properly normalized significand
223 | will have an integer portion equal to 1, the `zExp' input should be 1 less
224 | than the desired result exponent whenever `zSig' is a complete, normalized
226 *----------------------------------------------------------------------------*/
228 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
231 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
235 /*----------------------------------------------------------------------------
236 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
237 | and significand `zSig', and returns the proper single-precision floating-
238 | point value corresponding to the abstract input. Ordinarily, the abstract
239 | value is simply rounded and packed into the single-precision format, with
240 | the inexact exception raised if the abstract input cannot be represented
241 | exactly. However, if the abstract value is too large, the overflow and
242 | inexact exceptions are raised and an infinity or maximal finite value is
243 | returned. If the abstract value is too small, the input value is rounded to
244 | a subnormal number, and the underflow and inexact exceptions are raised if
245 | the abstract input cannot be represented exactly as a subnormal single-
246 | precision floating-point number.
247 | The input significand `zSig' has its binary point between bits 30
248 | and 29, which is 7 bits to the left of the usual location. This shifted
249 | significand must be normalized or smaller. If `zSig' is not normalized,
250 | `zExp' must be 0; in that case, the result returned is a subnormal number,
251 | and it must not require rounding. In the usual case that `zSig' is
252 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
253 | The handling of underflow and overflow follows the IEC/IEEE Standard for
254 | Binary Floating-Point Arithmetic.
255 *----------------------------------------------------------------------------*/
257 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
260 flag roundNearestEven;
261 int8 roundIncrement, roundBits;
264 roundingMode = STATUS(float_rounding_mode);
265 roundNearestEven = ( roundingMode == float_round_nearest_even );
266 roundIncrement = 0x40;
267 if ( ! roundNearestEven ) {
268 if ( roundingMode == float_round_to_zero ) {
272 roundIncrement = 0x7F;
274 if ( roundingMode == float_round_up ) roundIncrement = 0;
277 if ( roundingMode == float_round_down ) roundIncrement = 0;
281 roundBits = zSig & 0x7F;
282 if ( 0xFD <= (bits16) zExp ) {
284 || ( ( zExp == 0xFD )
285 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
287 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
288 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
292 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
294 || ( zSig + roundIncrement < 0x80000000 );
295 shift32RightJamming( zSig, - zExp, &zSig );
297 roundBits = zSig & 0x7F;
298 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
301 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
302 zSig = ( zSig + roundIncrement )>>7;
303 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
304 if ( zSig == 0 ) zExp = 0;
305 return packFloat32( zSign, zExp, zSig );
309 /*----------------------------------------------------------------------------
310 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
311 | and significand `zSig', and returns the proper single-precision floating-
312 | point value corresponding to the abstract input. This routine is just like
313 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
314 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
315 | floating-point exponent.
316 *----------------------------------------------------------------------------*/
319 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
323 shiftCount = countLeadingZeros32( zSig ) - 1;
324 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
328 /*----------------------------------------------------------------------------
329 | Returns the fraction bits of the double-precision floating-point value `a'.
330 *----------------------------------------------------------------------------*/
332 INLINE bits64 extractFloat64Frac( float64 a )
335 return a & LIT64( 0x000FFFFFFFFFFFFF );
339 /*----------------------------------------------------------------------------
340 | Returns the exponent bits of the double-precision floating-point value `a'.
341 *----------------------------------------------------------------------------*/
343 INLINE int16 extractFloat64Exp( float64 a )
346 return ( a>>52 ) & 0x7FF;
350 /*----------------------------------------------------------------------------
351 | Returns the sign bit of the double-precision floating-point value `a'.
352 *----------------------------------------------------------------------------*/
354 INLINE flag extractFloat64Sign( float64 a )
361 /*----------------------------------------------------------------------------
362 | Normalizes the subnormal double-precision floating-point value represented
363 | by the denormalized significand `aSig'. The normalized exponent and
364 | significand are stored at the locations pointed to by `zExpPtr' and
365 | `zSigPtr', respectively.
366 *----------------------------------------------------------------------------*/
369 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
373 shiftCount = countLeadingZeros64( aSig ) - 11;
374 *zSigPtr = aSig<<shiftCount;
375 *zExpPtr = 1 - shiftCount;
379 /*----------------------------------------------------------------------------
380 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
381 | double-precision floating-point value, returning the result. After being
382 | shifted into the proper positions, the three fields are simply added
383 | together to form the result. This means that any integer portion of `zSig'
384 | will be added into the exponent. Since a properly normalized significand
385 | will have an integer portion equal to 1, the `zExp' input should be 1 less
386 | than the desired result exponent whenever `zSig' is a complete, normalized
388 *----------------------------------------------------------------------------*/
390 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
393 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
397 /*----------------------------------------------------------------------------
398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
399 | and significand `zSig', and returns the proper double-precision floating-
400 | point value corresponding to the abstract input. Ordinarily, the abstract
401 | value is simply rounded and packed into the double-precision format, with
402 | the inexact exception raised if the abstract input cannot be represented
403 | exactly. However, if the abstract value is too large, the overflow and
404 | inexact exceptions are raised and an infinity or maximal finite value is
405 | returned. If the abstract value is too small, the input value is rounded
406 | to a subnormal number, and the underflow and inexact exceptions are raised
407 | if the abstract input cannot be represented exactly as a subnormal double-
408 | precision floating-point number.
409 | The input significand `zSig' has its binary point between bits 62
410 | and 61, which is 10 bits to the left of the usual location. This shifted
411 | significand must be normalized or smaller. If `zSig' is not normalized,
412 | `zExp' must be 0; in that case, the result returned is a subnormal number,
413 | and it must not require rounding. In the usual case that `zSig' is
414 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
415 | The handling of underflow and overflow follows the IEC/IEEE Standard for
416 | Binary Floating-Point Arithmetic.
417 *----------------------------------------------------------------------------*/
419 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
422 flag roundNearestEven;
423 int16 roundIncrement, roundBits;
426 roundingMode = STATUS(float_rounding_mode);
427 roundNearestEven = ( roundingMode == float_round_nearest_even );
428 roundIncrement = 0x200;
429 if ( ! roundNearestEven ) {
430 if ( roundingMode == float_round_to_zero ) {
434 roundIncrement = 0x3FF;
436 if ( roundingMode == float_round_up ) roundIncrement = 0;
439 if ( roundingMode == float_round_down ) roundIncrement = 0;
443 roundBits = zSig & 0x3FF;
444 if ( 0x7FD <= (bits16) zExp ) {
445 if ( ( 0x7FD < zExp )
446 || ( ( zExp == 0x7FD )
447 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
449 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
450 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
454 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
456 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
457 shift64RightJamming( zSig, - zExp, &zSig );
459 roundBits = zSig & 0x3FF;
460 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
463 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
464 zSig = ( zSig + roundIncrement )>>10;
465 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
466 if ( zSig == 0 ) zExp = 0;
467 return packFloat64( zSign, zExp, zSig );
471 /*----------------------------------------------------------------------------
472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
473 | and significand `zSig', and returns the proper double-precision floating-
474 | point value corresponding to the abstract input. This routine is just like
475 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
476 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
477 | floating-point exponent.
478 *----------------------------------------------------------------------------*/
481 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
485 shiftCount = countLeadingZeros64( zSig ) - 1;
486 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492 /*----------------------------------------------------------------------------
493 | Returns the fraction bits of the extended double-precision floating-point
495 *----------------------------------------------------------------------------*/
497 INLINE bits64 extractFloatx80Frac( floatx80 a )
504 /*----------------------------------------------------------------------------
505 | Returns the exponent bits of the extended double-precision floating-point
507 *----------------------------------------------------------------------------*/
509 INLINE int32 extractFloatx80Exp( floatx80 a )
512 return a.high & 0x7FFF;
516 /*----------------------------------------------------------------------------
517 | Returns the sign bit of the extended double-precision floating-point value
519 *----------------------------------------------------------------------------*/
521 INLINE flag extractFloatx80Sign( floatx80 a )
528 /*----------------------------------------------------------------------------
529 | Normalizes the subnormal extended double-precision floating-point value
530 | represented by the denormalized significand `aSig'. The normalized exponent
531 | and significand are stored at the locations pointed to by `zExpPtr' and
532 | `zSigPtr', respectively.
533 *----------------------------------------------------------------------------*/
536 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
540 shiftCount = countLeadingZeros64( aSig );
541 *zSigPtr = aSig<<shiftCount;
542 *zExpPtr = 1 - shiftCount;
546 /*----------------------------------------------------------------------------
547 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
548 | extended double-precision floating-point value, returning the result.
549 *----------------------------------------------------------------------------*/
551 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
556 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
561 /*----------------------------------------------------------------------------
562 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
563 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
564 | and returns the proper extended double-precision floating-point value
565 | corresponding to the abstract input. Ordinarily, the abstract value is
566 | rounded and packed into the extended double-precision format, with the
567 | inexact exception raised if the abstract input cannot be represented
568 | exactly. However, if the abstract value is too large, the overflow and
569 | inexact exceptions are raised and an infinity or maximal finite value is
570 | returned. If the abstract value is too small, the input value is rounded to
571 | a subnormal number, and the underflow and inexact exceptions are raised if
572 | the abstract input cannot be represented exactly as a subnormal extended
573 | double-precision floating-point number.
574 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
575 | number of bits as single or double precision, respectively. Otherwise, the
576 | result is rounded to the full precision of the extended double-precision
578 | The input significand must be normalized or smaller. If the input
579 | significand is not normalized, `zExp' must be 0; in that case, the result
580 | returned is a subnormal number, and it must not require rounding. The
581 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
582 | Floating-Point Arithmetic.
583 *----------------------------------------------------------------------------*/
586 roundAndPackFloatx80(
587 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
591 flag roundNearestEven, increment, isTiny;
592 int64 roundIncrement, roundMask, roundBits;
594 roundingMode = STATUS(float_rounding_mode);
595 roundNearestEven = ( roundingMode == float_round_nearest_even );
596 if ( roundingPrecision == 80 ) goto precision80;
597 if ( roundingPrecision == 64 ) {
598 roundIncrement = LIT64( 0x0000000000000400 );
599 roundMask = LIT64( 0x00000000000007FF );
601 else if ( roundingPrecision == 32 ) {
602 roundIncrement = LIT64( 0x0000008000000000 );
603 roundMask = LIT64( 0x000000FFFFFFFFFF );
608 zSig0 |= ( zSig1 != 0 );
609 if ( ! roundNearestEven ) {
610 if ( roundingMode == float_round_to_zero ) {
614 roundIncrement = roundMask;
616 if ( roundingMode == float_round_up ) roundIncrement = 0;
619 if ( roundingMode == float_round_down ) roundIncrement = 0;
623 roundBits = zSig0 & roundMask;
624 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
625 if ( ( 0x7FFE < zExp )
626 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
634 || ( zSig0 <= zSig0 + roundIncrement );
635 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
637 roundBits = zSig0 & roundMask;
638 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
639 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
640 zSig0 += roundIncrement;
641 if ( (sbits64) zSig0 < 0 ) zExp = 1;
642 roundIncrement = roundMask + 1;
643 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
644 roundMask |= roundIncrement;
646 zSig0 &= ~ roundMask;
647 return packFloatx80( zSign, zExp, zSig0 );
650 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
651 zSig0 += roundIncrement;
652 if ( zSig0 < roundIncrement ) {
654 zSig0 = LIT64( 0x8000000000000000 );
656 roundIncrement = roundMask + 1;
657 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
658 roundMask |= roundIncrement;
660 zSig0 &= ~ roundMask;
661 if ( zSig0 == 0 ) zExp = 0;
662 return packFloatx80( zSign, zExp, zSig0 );
664 increment = ( (sbits64) zSig1 < 0 );
665 if ( ! roundNearestEven ) {
666 if ( roundingMode == float_round_to_zero ) {
671 increment = ( roundingMode == float_round_down ) && zSig1;
674 increment = ( roundingMode == float_round_up ) && zSig1;
678 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
679 if ( ( 0x7FFE < zExp )
680 || ( ( zExp == 0x7FFE )
681 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
688 if ( ( roundingMode == float_round_to_zero )
689 || ( zSign && ( roundingMode == float_round_up ) )
690 || ( ! zSign && ( roundingMode == float_round_down ) )
692 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
694 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
698 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
701 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
702 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
704 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
705 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
706 if ( roundNearestEven ) {
707 increment = ( (sbits64) zSig1 < 0 );
711 increment = ( roundingMode == float_round_down ) && zSig1;
714 increment = ( roundingMode == float_round_up ) && zSig1;
720 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
721 if ( (sbits64) zSig0 < 0 ) zExp = 1;
723 return packFloatx80( zSign, zExp, zSig0 );
726 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
731 zSig0 = LIT64( 0x8000000000000000 );
734 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
738 if ( zSig0 == 0 ) zExp = 0;
740 return packFloatx80( zSign, zExp, zSig0 );
744 /*----------------------------------------------------------------------------
745 | Takes an abstract floating-point value having sign `zSign', exponent
746 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
747 | and returns the proper extended double-precision floating-point value
748 | corresponding to the abstract input. This routine is just like
749 | `roundAndPackFloatx80' except that the input significand does not have to be
751 *----------------------------------------------------------------------------*/
754 normalizeRoundAndPackFloatx80(
755 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
765 shiftCount = countLeadingZeros64( zSig0 );
766 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
769 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
777 /*----------------------------------------------------------------------------
778 | Returns the least-significant 64 fraction bits of the quadruple-precision
779 | floating-point value `a'.
780 *----------------------------------------------------------------------------*/
782 INLINE bits64 extractFloat128Frac1( float128 a )
789 /*----------------------------------------------------------------------------
790 | Returns the most-significant 48 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64 extractFloat128Frac0( float128 a )
797 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
801 /*----------------------------------------------------------------------------
802 | Returns the exponent bits of the quadruple-precision floating-point value
804 *----------------------------------------------------------------------------*/
806 INLINE int32 extractFloat128Exp( float128 a )
809 return ( a.high>>48 ) & 0x7FFF;
813 /*----------------------------------------------------------------------------
814 | Returns the sign bit of the quadruple-precision floating-point value `a'.
815 *----------------------------------------------------------------------------*/
817 INLINE flag extractFloat128Sign( float128 a )
824 /*----------------------------------------------------------------------------
825 | Normalizes the subnormal quadruple-precision floating-point value
826 | represented by the denormalized significand formed by the concatenation of
827 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
828 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
829 | significand are stored at the location pointed to by `zSig0Ptr', and the
830 | least significant 64 bits of the normalized significand are stored at the
831 | location pointed to by `zSig1Ptr'.
832 *----------------------------------------------------------------------------*/
835 normalizeFloat128Subnormal(
846 shiftCount = countLeadingZeros64( aSig1 ) - 15;
847 if ( shiftCount < 0 ) {
848 *zSig0Ptr = aSig1>>( - shiftCount );
849 *zSig1Ptr = aSig1<<( shiftCount & 63 );
852 *zSig0Ptr = aSig1<<shiftCount;
855 *zExpPtr = - shiftCount - 63;
858 shiftCount = countLeadingZeros64( aSig0 ) - 15;
859 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
860 *zExpPtr = 1 - shiftCount;
865 /*----------------------------------------------------------------------------
866 | Packs the sign `zSign', the exponent `zExp', and the significand formed
867 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
868 | floating-point value, returning the result. After being shifted into the
869 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
870 | added together to form the most significant 32 bits of the result. This
871 | means that any integer portion of `zSig0' will be added into the exponent.
872 | Since a properly normalized significand will have an integer portion equal
873 | to 1, the `zExp' input should be 1 less than the desired result exponent
874 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
876 *----------------------------------------------------------------------------*/
879 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
884 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
889 /*----------------------------------------------------------------------------
890 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
891 | and extended significand formed by the concatenation of `zSig0', `zSig1',
892 | and `zSig2', and returns the proper quadruple-precision floating-point value
893 | corresponding to the abstract input. Ordinarily, the abstract value is
894 | simply rounded and packed into the quadruple-precision format, with the
895 | inexact exception raised if the abstract input cannot be represented
896 | exactly. However, if the abstract value is too large, the overflow and
897 | inexact exceptions are raised and an infinity or maximal finite value is
898 | returned. If the abstract value is too small, the input value is rounded to
899 | a subnormal number, and the underflow and inexact exceptions are raised if
900 | the abstract input cannot be represented exactly as a subnormal quadruple-
901 | precision floating-point number.
902 | The input significand must be normalized or smaller. If the input
903 | significand is not normalized, `zExp' must be 0; in that case, the result
904 | returned is a subnormal number, and it must not require rounding. In the
905 | usual case that the input significand is normalized, `zExp' must be 1 less
906 | than the ``true'' floating-point exponent. The handling of underflow and
907 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
908 *----------------------------------------------------------------------------*/
911 roundAndPackFloat128(
912 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
915 flag roundNearestEven, increment, isTiny;
917 roundingMode = STATUS(float_rounding_mode);
918 roundNearestEven = ( roundingMode == float_round_nearest_even );
919 increment = ( (sbits64) zSig2 < 0 );
920 if ( ! roundNearestEven ) {
921 if ( roundingMode == float_round_to_zero ) {
926 increment = ( roundingMode == float_round_down ) && zSig2;
929 increment = ( roundingMode == float_round_up ) && zSig2;
933 if ( 0x7FFD <= (bits32) zExp ) {
934 if ( ( 0x7FFD < zExp )
935 || ( ( zExp == 0x7FFD )
937 LIT64( 0x0001FFFFFFFFFFFF ),
938 LIT64( 0xFFFFFFFFFFFFFFFF ),
945 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
946 if ( ( roundingMode == float_round_to_zero )
947 || ( zSign && ( roundingMode == float_round_up ) )
948 || ( ! zSign && ( roundingMode == float_round_down ) )
954 LIT64( 0x0000FFFFFFFFFFFF ),
955 LIT64( 0xFFFFFFFFFFFFFFFF )
958 return packFloat128( zSign, 0x7FFF, 0, 0 );
962 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968 LIT64( 0x0001FFFFFFFFFFFF ),
969 LIT64( 0xFFFFFFFFFFFFFFFF )
971 shift128ExtraRightJamming(
972 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
974 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
975 if ( roundNearestEven ) {
976 increment = ( (sbits64) zSig2 < 0 );
980 increment = ( roundingMode == float_round_down ) && zSig2;
983 increment = ( roundingMode == float_round_up ) && zSig2;
988 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
990 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
991 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
994 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
996 return packFloat128( zSign, zExp, zSig0, zSig1 );
1000 /*----------------------------------------------------------------------------
1001 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1002 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1003 | returns the proper quadruple-precision floating-point value corresponding
1004 | to the abstract input. This routine is just like `roundAndPackFloat128'
1005 | except that the input significand has fewer bits and does not have to be
1006 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1008 *----------------------------------------------------------------------------*/
1011 normalizeRoundAndPackFloat128(
1012 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1022 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1023 if ( 0 <= shiftCount ) {
1025 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1028 shift128ExtraRightJamming(
1029 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1032 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038 /*----------------------------------------------------------------------------
1039 | Returns the result of converting the 32-bit two's complement integer `a'
1040 | to the single-precision floating-point format. The conversion is performed
1041 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1042 *----------------------------------------------------------------------------*/
1044 float32 int32_to_float32( int32 a STATUS_PARAM )
1048 if ( a == 0 ) return 0;
1049 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1051 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1055 /*----------------------------------------------------------------------------
1056 | Returns the result of converting the 32-bit two's complement integer `a'
1057 | to the double-precision floating-point format. The conversion is performed
1058 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1059 *----------------------------------------------------------------------------*/
1061 float64 int32_to_float64( int32 a STATUS_PARAM )
1068 if ( a == 0 ) return 0;
1070 absA = zSign ? - a : a;
1071 shiftCount = countLeadingZeros32( absA ) + 21;
1073 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079 /*----------------------------------------------------------------------------
1080 | Returns the result of converting the 32-bit two's complement integer `a'
1081 | to the extended double-precision floating-point format. The conversion
1082 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1084 *----------------------------------------------------------------------------*/
1086 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1093 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1095 absA = zSign ? - a : a;
1096 shiftCount = countLeadingZeros32( absA ) + 32;
1098 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a' to
1108 | the quadruple-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float128 int32_to_float128( int32 a STATUS_PARAM )
1119 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1121 absA = zSign ? - a : a;
1122 shiftCount = countLeadingZeros32( absA ) + 17;
1124 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130 /*----------------------------------------------------------------------------
1131 | Returns the result of converting the 64-bit two's complement integer `a'
1132 | to the single-precision floating-point format. The conversion is performed
1133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 *----------------------------------------------------------------------------*/
1136 float32 int64_to_float32( int64 a STATUS_PARAM )
1142 if ( a == 0 ) return 0;
1144 absA = zSign ? - a : a;
1145 shiftCount = countLeadingZeros64( absA ) - 40;
1146 if ( 0 <= shiftCount ) {
1147 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1151 if ( shiftCount < 0 ) {
1152 shift64RightJamming( absA, - shiftCount, &absA );
1155 absA <<= shiftCount;
1157 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1162 /*----------------------------------------------------------------------------
1163 | Returns the result of converting the 64-bit two's complement integer `a'
1164 | to the double-precision floating-point format. The conversion is performed
1165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166 *----------------------------------------------------------------------------*/
1168 float64 int64_to_float64( int64 a STATUS_PARAM )
1172 if ( a == 0 ) return 0;
1173 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1174 return packFloat64( 1, 0x43E, 0 );
1177 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 64-bit two's complement integer `a'
1185 | to the extended double-precision floating-point format. The conversion
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1188 *----------------------------------------------------------------------------*/
1190 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1198 absA = zSign ? - a : a;
1199 shiftCount = countLeadingZeros64( absA );
1200 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a' to
1210 | the quadruple-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1214 float128 int64_to_float128( int64 a STATUS_PARAM )
1220 bits64 zSig0, zSig1;
1222 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1224 absA = zSign ? - a : a;
1225 shiftCount = countLeadingZeros64( absA ) + 49;
1226 zExp = 0x406E - shiftCount;
1227 if ( 64 <= shiftCount ) {
1236 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1237 return packFloat128( zSign, zExp, zSig0, zSig1 );
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the single-precision floating-point value
1245 | `a' to the 32-bit two's complement integer format. The conversion is
1246 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1247 | Arithmetic---which means in particular that the conversion is rounded
1248 | according to the current rounding mode. If `a' is a NaN, the largest
1249 | positive integer is returned. Otherwise, if the conversion overflows, the
1250 | largest integer with the same sign as `a' is returned.
1251 *----------------------------------------------------------------------------*/
1253 int32 float32_to_int32( float32 a STATUS_PARAM )
1256 int16 aExp, shiftCount;
1260 aSig = extractFloat32Frac( a );
1261 aExp = extractFloat32Exp( a );
1262 aSign = extractFloat32Sign( a );
1263 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1264 if ( aExp ) aSig |= 0x00800000;
1265 shiftCount = 0xAF - aExp;
1268 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1269 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the single-precision floating-point value
1275 | `a' to the 32-bit two's complement integer format. The conversion is
1276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 | Arithmetic, except that the conversion is always rounded toward zero.
1278 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1279 | the conversion overflows, the largest integer with the same sign as `a' is
1281 *----------------------------------------------------------------------------*/
1283 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1286 int16 aExp, shiftCount;
1290 aSig = extractFloat32Frac( a );
1291 aExp = extractFloat32Exp( a );
1292 aSign = extractFloat32Sign( a );
1293 shiftCount = aExp - 0x9E;
1294 if ( 0 <= shiftCount ) {
1295 if ( a != 0xCF000000 ) {
1296 float_raise( float_flag_invalid STATUS_VAR);
1297 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1299 return (sbits32) 0x80000000;
1301 else if ( aExp <= 0x7E ) {
1302 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1305 aSig = ( aSig | 0x00800000 )<<8;
1306 z = aSig>>( - shiftCount );
1307 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1308 STATUS(float_exception_flags) |= float_flag_inexact;
1310 if ( aSign ) z = - z;
1315 /*----------------------------------------------------------------------------
1316 | Returns the result of converting the single-precision floating-point value
1317 | `a' to the 64-bit two's complement integer format. The conversion is
1318 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 | Arithmetic---which means in particular that the conversion is rounded
1320 | according to the current rounding mode. If `a' is a NaN, the largest
1321 | positive integer is returned. Otherwise, if the conversion overflows, the
1322 | largest integer with the same sign as `a' is returned.
1323 *----------------------------------------------------------------------------*/
1325 int64 float32_to_int64( float32 a STATUS_PARAM )
1328 int16 aExp, shiftCount;
1330 bits64 aSig64, aSigExtra;
1332 aSig = extractFloat32Frac( a );
1333 aExp = extractFloat32Exp( a );
1334 aSign = extractFloat32Sign( a );
1335 shiftCount = 0xBE - aExp;
1336 if ( shiftCount < 0 ) {
1337 float_raise( float_flag_invalid STATUS_VAR);
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1339 return LIT64( 0x7FFFFFFFFFFFFFFF );
1341 return (sbits64) LIT64( 0x8000000000000000 );
1343 if ( aExp ) aSig |= 0x00800000;
1346 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1347 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the single-precision floating-point value
1353 | `a' to the 64-bit two's complement integer format. The conversion is
1354 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1355 | Arithmetic, except that the conversion is always rounded toward zero. If
1356 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1357 | conversion overflows, the largest integer with the same sign as `a' is
1359 *----------------------------------------------------------------------------*/
1361 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1364 int16 aExp, shiftCount;
1369 aSig = extractFloat32Frac( a );
1370 aExp = extractFloat32Exp( a );
1371 aSign = extractFloat32Sign( a );
1372 shiftCount = aExp - 0xBE;
1373 if ( 0 <= shiftCount ) {
1374 if ( a != 0xDF000000 ) {
1375 float_raise( float_flag_invalid STATUS_VAR);
1376 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1377 return LIT64( 0x7FFFFFFFFFFFFFFF );
1380 return (sbits64) LIT64( 0x8000000000000000 );
1382 else if ( aExp <= 0x7E ) {
1383 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1386 aSig64 = aSig | 0x00800000;
1388 z = aSig64>>( - shiftCount );
1389 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1390 STATUS(float_exception_flags) |= float_flag_inexact;
1392 if ( aSign ) z = - z;
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the double-precision floating-point format. The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1402 *----------------------------------------------------------------------------*/
1404 float64 float32_to_float64( float32 a STATUS_PARAM )
1410 aSig = extractFloat32Frac( a );
1411 aExp = extractFloat32Exp( a );
1412 aSign = extractFloat32Sign( a );
1413 if ( aExp == 0xFF ) {
1414 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1415 return packFloat64( aSign, 0x7FF, 0 );
1418 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1419 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1422 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the single-precision floating-point value
1430 | `a' to the extended double-precision floating-point format. The conversion
1431 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1433 *----------------------------------------------------------------------------*/
1435 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( aExp == 0xFF ) {
1445 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1446 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1449 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1450 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1453 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the double-precision floating-point format. The conversion is
1464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 *----------------------------------------------------------------------------*/
1468 float128 float32_to_float128( float32 a STATUS_PARAM )
1474 aSig = extractFloat32Frac( a );
1475 aExp = extractFloat32Exp( a );
1476 aSign = extractFloat32Sign( a );
1477 if ( aExp == 0xFF ) {
1478 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1479 return packFloat128( aSign, 0x7FFF, 0, 0 );
1482 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1483 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1486 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492 /*----------------------------------------------------------------------------
1493 | Rounds the single-precision floating-point value `a' to an integer, and
1494 | returns the result as a single-precision floating-point value. The
1495 | operation is performed according to the IEC/IEEE Standard for Binary
1496 | Floating-Point Arithmetic.
1497 *----------------------------------------------------------------------------*/
1499 float32 float32_round_to_int( float32 a STATUS_PARAM)
1503 bits32 lastBitMask, roundBitsMask;
1507 aExp = extractFloat32Exp( a );
1508 if ( 0x96 <= aExp ) {
1509 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1510 return propagateFloat32NaN( a, a STATUS_VAR );
1514 if ( aExp <= 0x7E ) {
1515 if ( (bits32) ( a<<1 ) == 0 ) return a;
1516 STATUS(float_exception_flags) |= float_flag_inexact;
1517 aSign = extractFloat32Sign( a );
1518 switch ( STATUS(float_rounding_mode) ) {
1519 case float_round_nearest_even:
1520 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1521 return packFloat32( aSign, 0x7F, 0 );
1524 case float_round_down:
1525 return aSign ? 0xBF800000 : 0;
1526 case float_round_up:
1527 return aSign ? 0x80000000 : 0x3F800000;
1529 return packFloat32( aSign, 0, 0 );
1532 lastBitMask <<= 0x96 - aExp;
1533 roundBitsMask = lastBitMask - 1;
1535 roundingMode = STATUS(float_rounding_mode);
1536 if ( roundingMode == float_round_nearest_even ) {
1537 z += lastBitMask>>1;
1538 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1540 else if ( roundingMode != float_round_to_zero ) {
1541 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1545 z &= ~ roundBitsMask;
1546 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of adding the absolute values of the single-precision
1553 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1554 | before being returned. `zSign' is ignored if the result is a NaN.
1555 | The addition is performed according to the IEC/IEEE Standard for Binary
1556 | Floating-Point Arithmetic.
1557 *----------------------------------------------------------------------------*/
1559 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1561 int16 aExp, bExp, zExp;
1562 bits32 aSig, bSig, zSig;
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 bSig = extractFloat32Frac( b );
1568 bExp = extractFloat32Exp( b );
1569 expDiff = aExp - bExp;
1572 if ( 0 < expDiff ) {
1573 if ( aExp == 0xFF ) {
1574 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1583 shift32RightJamming( bSig, expDiff, &bSig );
1586 else if ( expDiff < 0 ) {
1587 if ( bExp == 0xFF ) {
1588 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1589 return packFloat32( zSign, 0xFF, 0 );
1597 shift32RightJamming( aSig, - expDiff, &aSig );
1601 if ( aExp == 0xFF ) {
1602 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1605 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1606 zSig = 0x40000000 + aSig + bSig;
1611 zSig = ( aSig + bSig )<<1;
1613 if ( (sbits32) zSig < 0 ) {
1618 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of subtracting the absolute values of the single-
1624 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1625 | difference is negated before being returned. `zSign' is ignored if the
1626 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1627 | Standard for Binary Floating-Point Arithmetic.
1628 *----------------------------------------------------------------------------*/
1630 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1632 int16 aExp, bExp, zExp;
1633 bits32 aSig, bSig, zSig;
1636 aSig = extractFloat32Frac( a );
1637 aExp = extractFloat32Exp( a );
1638 bSig = extractFloat32Frac( b );
1639 bExp = extractFloat32Exp( b );
1640 expDiff = aExp - bExp;
1643 if ( 0 < expDiff ) goto aExpBigger;
1644 if ( expDiff < 0 ) goto bExpBigger;
1645 if ( aExp == 0xFF ) {
1646 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1647 float_raise( float_flag_invalid STATUS_VAR);
1648 return float32_default_nan;
1654 if ( bSig < aSig ) goto aBigger;
1655 if ( aSig < bSig ) goto bBigger;
1656 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1658 if ( bExp == 0xFF ) {
1659 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1660 return packFloat32( zSign ^ 1, 0xFF, 0 );
1668 shift32RightJamming( aSig, - expDiff, &aSig );
1674 goto normalizeRoundAndPack;
1676 if ( aExp == 0xFF ) {
1677 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1686 shift32RightJamming( bSig, expDiff, &bSig );
1691 normalizeRoundAndPack:
1693 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1697 /*----------------------------------------------------------------------------
1698 | Returns the result of adding the single-precision floating-point values `a'
1699 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1700 | Binary Floating-Point Arithmetic.
1701 *----------------------------------------------------------------------------*/
1703 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1707 aSign = extractFloat32Sign( a );
1708 bSign = extractFloat32Sign( b );
1709 if ( aSign == bSign ) {
1710 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1713 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of subtracting the single-precision floating-point values
1720 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1721 | for Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1724 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1728 aSign = extractFloat32Sign( a );
1729 bSign = extractFloat32Sign( b );
1730 if ( aSign == bSign ) {
1731 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1734 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1739 /*----------------------------------------------------------------------------
1740 | Returns the result of multiplying the single-precision floating-point values
1741 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1742 | for Binary Floating-Point Arithmetic.
1743 *----------------------------------------------------------------------------*/
1745 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1747 flag aSign, bSign, zSign;
1748 int16 aExp, bExp, zExp;
1753 aSig = extractFloat32Frac( a );
1754 aExp = extractFloat32Exp( a );
1755 aSign = extractFloat32Sign( a );
1756 bSig = extractFloat32Frac( b );
1757 bExp = extractFloat32Exp( b );
1758 bSign = extractFloat32Sign( b );
1759 zSign = aSign ^ bSign;
1760 if ( aExp == 0xFF ) {
1761 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1762 return propagateFloat32NaN( a, b STATUS_VAR );
1764 if ( ( bExp | bSig ) == 0 ) {
1765 float_raise( float_flag_invalid STATUS_VAR);
1766 return float32_default_nan;
1768 return packFloat32( zSign, 0xFF, 0 );
1770 if ( bExp == 0xFF ) {
1771 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1772 if ( ( aExp | aSig ) == 0 ) {
1773 float_raise( float_flag_invalid STATUS_VAR);
1774 return float32_default_nan;
1776 return packFloat32( zSign, 0xFF, 0 );
1779 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1780 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1783 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1784 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1786 zExp = aExp + bExp - 0x7F;
1787 aSig = ( aSig | 0x00800000 )<<7;
1788 bSig = ( bSig | 0x00800000 )<<8;
1789 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1791 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1795 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1799 /*----------------------------------------------------------------------------
1800 | Returns the result of dividing the single-precision floating-point value `a'
1801 | by the corresponding value `b'. The operation is performed according to the
1802 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803 *----------------------------------------------------------------------------*/
1805 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1807 flag aSign, bSign, zSign;
1808 int16 aExp, bExp, zExp;
1809 bits32 aSig, bSig, zSig;
1811 aSig = extractFloat32Frac( a );
1812 aExp = extractFloat32Exp( a );
1813 aSign = extractFloat32Sign( a );
1814 bSig = extractFloat32Frac( b );
1815 bExp = extractFloat32Exp( b );
1816 bSign = extractFloat32Sign( b );
1817 zSign = aSign ^ bSign;
1818 if ( aExp == 0xFF ) {
1819 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1820 if ( bExp == 0xFF ) {
1821 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822 float_raise( float_flag_invalid STATUS_VAR);
1823 return float32_default_nan;
1825 return packFloat32( zSign, 0xFF, 0 );
1827 if ( bExp == 0xFF ) {
1828 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1829 return packFloat32( zSign, 0, 0 );
1833 if ( ( aExp | aSig ) == 0 ) {
1834 float_raise( float_flag_invalid STATUS_VAR);
1835 return float32_default_nan;
1837 float_raise( float_flag_divbyzero STATUS_VAR);
1838 return packFloat32( zSign, 0xFF, 0 );
1840 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1843 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1844 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1846 zExp = aExp - bExp + 0x7D;
1847 aSig = ( aSig | 0x00800000 )<<7;
1848 bSig = ( bSig | 0x00800000 )<<8;
1849 if ( bSig <= ( aSig + aSig ) ) {
1853 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1854 if ( ( zSig & 0x3F ) == 0 ) {
1855 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1857 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1861 /*----------------------------------------------------------------------------
1862 | Returns the remainder of the single-precision floating-point value `a'
1863 | with respect to the corresponding value `b'. The operation is performed
1864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1867 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1869 flag aSign, bSign, zSign;
1870 int16 aExp, bExp, expDiff;
1873 bits64 aSig64, bSig64, q64;
1874 bits32 alternateASig;
1877 aSig = extractFloat32Frac( a );
1878 aExp = extractFloat32Exp( a );
1879 aSign = extractFloat32Sign( a );
1880 bSig = extractFloat32Frac( b );
1881 bExp = extractFloat32Exp( b );
1882 bSign = extractFloat32Sign( b );
1883 if ( aExp == 0xFF ) {
1884 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1885 return propagateFloat32NaN( a, b STATUS_VAR );
1887 float_raise( float_flag_invalid STATUS_VAR);
1888 return float32_default_nan;
1890 if ( bExp == 0xFF ) {
1891 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1896 float_raise( float_flag_invalid STATUS_VAR);
1897 return float32_default_nan;
1899 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1902 if ( aSig == 0 ) return a;
1903 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1905 expDiff = aExp - bExp;
1908 if ( expDiff < 32 ) {
1911 if ( expDiff < 0 ) {
1912 if ( expDiff < -1 ) return a;
1915 q = ( bSig <= aSig );
1916 if ( q ) aSig -= bSig;
1917 if ( 0 < expDiff ) {
1918 q = ( ( (bits64) aSig )<<32 ) / bSig;
1921 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1929 if ( bSig <= aSig ) aSig -= bSig;
1930 aSig64 = ( (bits64) aSig )<<40;
1931 bSig64 = ( (bits64) bSig )<<40;
1933 while ( 0 < expDiff ) {
1934 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1935 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1936 aSig64 = - ( ( bSig * q64 )<<38 );
1940 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1941 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1942 q = q64>>( 64 - expDiff );
1944 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1947 alternateASig = aSig;
1950 } while ( 0 <= (sbits32) aSig );
1951 sigMean = aSig + alternateASig;
1952 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1953 aSig = alternateASig;
1955 zSign = ( (sbits32) aSig < 0 );
1956 if ( zSign ) aSig = - aSig;
1957 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1961 /*----------------------------------------------------------------------------
1962 | Returns the square root of the single-precision floating-point value `a'.
1963 | The operation is performed according to the IEC/IEEE Standard for Binary
1964 | Floating-Point Arithmetic.
1965 *----------------------------------------------------------------------------*/
1967 float32 float32_sqrt( float32 a STATUS_PARAM )
1974 aSig = extractFloat32Frac( a );
1975 aExp = extractFloat32Exp( a );
1976 aSign = extractFloat32Sign( a );
1977 if ( aExp == 0xFF ) {
1978 if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1979 if ( ! aSign ) return a;
1980 float_raise( float_flag_invalid STATUS_VAR);
1981 return float32_default_nan;
1984 if ( ( aExp | aSig ) == 0 ) return a;
1985 float_raise( float_flag_invalid STATUS_VAR);
1986 return float32_default_nan;
1989 if ( aSig == 0 ) return 0;
1990 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1992 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1993 aSig = ( aSig | 0x00800000 )<<8;
1994 zSig = estimateSqrt32( aExp, aSig ) + 2;
1995 if ( ( zSig & 0x7F ) <= 5 ) {
2001 term = ( (bits64) zSig ) * zSig;
2002 rem = ( ( (bits64) aSig )<<32 ) - term;
2003 while ( (sbits64) rem < 0 ) {
2005 rem += ( ( (bits64) zSig )<<1 ) | 1;
2007 zSig |= ( rem != 0 );
2009 shift32RightJamming( zSig, 1, &zSig );
2011 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2015 /*----------------------------------------------------------------------------
2016 | Returns 1 if the single-precision floating-point value `a' is equal to
2017 | the corresponding value `b', and 0 otherwise. The comparison is performed
2018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 *----------------------------------------------------------------------------*/
2021 flag float32_eq( float32 a, float32 b STATUS_PARAM )
2024 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2025 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2027 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2028 float_raise( float_flag_invalid STATUS_VAR);
2032 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2036 /*----------------------------------------------------------------------------
2037 | Returns 1 if the single-precision floating-point value `a' is less than
2038 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2041 *----------------------------------------------------------------------------*/
2043 flag float32_le( float32 a, float32 b STATUS_PARAM )
2047 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2048 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2050 float_raise( float_flag_invalid STATUS_VAR);
2053 aSign = extractFloat32Sign( a );
2054 bSign = extractFloat32Sign( b );
2055 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2056 return ( a == b ) || ( aSign ^ ( a < b ) );
2060 /*----------------------------------------------------------------------------
2061 | Returns 1 if the single-precision floating-point value `a' is less than
2062 | the corresponding value `b', and 0 otherwise. The comparison is performed
2063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064 *----------------------------------------------------------------------------*/
2066 flag float32_lt( float32 a, float32 b STATUS_PARAM )
2070 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2071 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2073 float_raise( float_flag_invalid STATUS_VAR);
2076 aSign = extractFloat32Sign( a );
2077 bSign = extractFloat32Sign( b );
2078 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2079 return ( a != b ) && ( aSign ^ ( a < b ) );
2083 /*----------------------------------------------------------------------------
2084 | Returns 1 if the single-precision floating-point value `a' is equal to
2085 | the corresponding value `b', and 0 otherwise. The invalid exception is
2086 | raised if either operand is a NaN. Otherwise, the comparison is performed
2087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088 *----------------------------------------------------------------------------*/
2090 flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2093 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2096 float_raise( float_flag_invalid STATUS_VAR);
2099 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2103 /*----------------------------------------------------------------------------
2104 | Returns 1 if the single-precision floating-point value `a' is less than or
2105 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2106 | cause an exception. Otherwise, the comparison is performed according to the
2107 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108 *----------------------------------------------------------------------------*/
2110 flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118 float_raise( float_flag_invalid STATUS_VAR);
2122 aSign = extractFloat32Sign( a );
2123 bSign = extractFloat32Sign( b );
2124 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2125 return ( a == b ) || ( aSign ^ ( a < b ) );
2129 /*----------------------------------------------------------------------------
2130 | Returns 1 if the single-precision floating-point value `a' is less than
2131 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2132 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2133 | Standard for Binary Floating-Point Arithmetic.
2134 *----------------------------------------------------------------------------*/
2136 flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2140 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2143 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2144 float_raise( float_flag_invalid STATUS_VAR);
2148 aSign = extractFloat32Sign( a );
2149 bSign = extractFloat32Sign( b );
2150 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2151 return ( a != b ) && ( aSign ^ ( a < b ) );
2155 /*----------------------------------------------------------------------------
2156 | Returns the result of converting the double-precision floating-point value
2157 | `a' to the 32-bit two's complement integer format. The conversion is
2158 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2159 | Arithmetic---which means in particular that the conversion is rounded
2160 | according to the current rounding mode. If `a' is a NaN, the largest
2161 | positive integer is returned. Otherwise, if the conversion overflows, the
2162 | largest integer with the same sign as `a' is returned.
2163 *----------------------------------------------------------------------------*/
2165 int32 float64_to_int32( float64 a STATUS_PARAM )
2168 int16 aExp, shiftCount;
2171 aSig = extractFloat64Frac( a );
2172 aExp = extractFloat64Exp( a );
2173 aSign = extractFloat64Sign( a );
2174 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2175 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2176 shiftCount = 0x42C - aExp;
2177 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2178 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2182 /*----------------------------------------------------------------------------
2183 | Returns the result of converting the double-precision floating-point value
2184 | `a' to the 32-bit two's complement integer format. The conversion is
2185 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2186 | Arithmetic, except that the conversion is always rounded toward zero.
2187 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2188 | the conversion overflows, the largest integer with the same sign as `a' is
2190 *----------------------------------------------------------------------------*/
2192 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2195 int16 aExp, shiftCount;
2196 bits64 aSig, savedASig;
2199 aSig = extractFloat64Frac( a );
2200 aExp = extractFloat64Exp( a );
2201 aSign = extractFloat64Sign( a );
2202 if ( 0x41E < aExp ) {
2203 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2206 else if ( aExp < 0x3FF ) {
2207 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2210 aSig |= LIT64( 0x0010000000000000 );
2211 shiftCount = 0x433 - aExp;
2213 aSig >>= shiftCount;
2215 if ( aSign ) z = - z;
2216 if ( ( z < 0 ) ^ aSign ) {
2218 float_raise( float_flag_invalid STATUS_VAR);
2219 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2221 if ( ( aSig<<shiftCount ) != savedASig ) {
2222 STATUS(float_exception_flags) |= float_flag_inexact;
2228 /*----------------------------------------------------------------------------
2229 | Returns the result of converting the double-precision floating-point value
2230 | `a' to the 64-bit two's complement integer format. The conversion is
2231 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2232 | Arithmetic---which means in particular that the conversion is rounded
2233 | according to the current rounding mode. If `a' is a NaN, the largest
2234 | positive integer is returned. Otherwise, if the conversion overflows, the
2235 | largest integer with the same sign as `a' is returned.
2236 *----------------------------------------------------------------------------*/
2238 int64 float64_to_int64( float64 a STATUS_PARAM )
2241 int16 aExp, shiftCount;
2242 bits64 aSig, aSigExtra;
2244 aSig = extractFloat64Frac( a );
2245 aExp = extractFloat64Exp( a );
2246 aSign = extractFloat64Sign( a );
2247 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2248 shiftCount = 0x433 - aExp;
2249 if ( shiftCount <= 0 ) {
2250 if ( 0x43E < aExp ) {
2251 float_raise( float_flag_invalid STATUS_VAR);
2253 || ( ( aExp == 0x7FF )
2254 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2256 return LIT64( 0x7FFFFFFFFFFFFFFF );
2258 return (sbits64) LIT64( 0x8000000000000000 );
2261 aSig <<= - shiftCount;
2264 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2266 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2270 /*----------------------------------------------------------------------------
2271 | Returns the result of converting the double-precision floating-point value
2272 | `a' to the 64-bit two's complement integer format. The conversion is
2273 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2274 | Arithmetic, except that the conversion is always rounded toward zero.
2275 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2276 | the conversion overflows, the largest integer with the same sign as `a' is
2278 *----------------------------------------------------------------------------*/
2280 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2283 int16 aExp, shiftCount;
2287 aSig = extractFloat64Frac( a );
2288 aExp = extractFloat64Exp( a );
2289 aSign = extractFloat64Sign( a );
2290 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2291 shiftCount = aExp - 0x433;
2292 if ( 0 <= shiftCount ) {
2293 if ( 0x43E <= aExp ) {
2294 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2295 float_raise( float_flag_invalid STATUS_VAR);
2297 || ( ( aExp == 0x7FF )
2298 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2300 return LIT64( 0x7FFFFFFFFFFFFFFF );
2303 return (sbits64) LIT64( 0x8000000000000000 );
2305 z = aSig<<shiftCount;
2308 if ( aExp < 0x3FE ) {
2309 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2312 z = aSig>>( - shiftCount );
2313 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2314 STATUS(float_exception_flags) |= float_flag_inexact;
2317 if ( aSign ) z = - z;
2322 /*----------------------------------------------------------------------------
2323 | Returns the result of converting the double-precision floating-point value
2324 | `a' to the single-precision floating-point format. The conversion is
2325 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 *----------------------------------------------------------------------------*/
2329 float32 float64_to_float32( float64 a STATUS_PARAM )
2336 aSig = extractFloat64Frac( a );
2337 aExp = extractFloat64Exp( a );
2338 aSign = extractFloat64Sign( a );
2339 if ( aExp == 0x7FF ) {
2340 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2341 return packFloat32( aSign, 0xFF, 0 );
2343 shift64RightJamming( aSig, 22, &aSig );
2345 if ( aExp || zSig ) {
2349 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355 /*----------------------------------------------------------------------------
2356 | Returns the result of converting the double-precision floating-point value
2357 | `a' to the extended double-precision floating-point format. The conversion
2358 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 *----------------------------------------------------------------------------*/
2362 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368 aSig = extractFloat64Frac( a );
2369 aExp = extractFloat64Exp( a );
2370 aSign = extractFloat64Sign( a );
2371 if ( aExp == 0x7FF ) {
2372 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2373 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2376 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2377 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2381 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2389 /*----------------------------------------------------------------------------
2390 | Returns the result of converting the double-precision floating-point value
2391 | `a' to the quadruple-precision floating-point format. The conversion is
2392 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2394 *----------------------------------------------------------------------------*/
2396 float128 float64_to_float128( float64 a STATUS_PARAM )
2400 bits64 aSig, zSig0, zSig1;
2402 aSig = extractFloat64Frac( a );
2403 aExp = extractFloat64Exp( a );
2404 aSign = extractFloat64Sign( a );
2405 if ( aExp == 0x7FF ) {
2406 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2407 return packFloat128( aSign, 0x7FFF, 0, 0 );
2410 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2411 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2414 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2415 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421 /*----------------------------------------------------------------------------
2422 | Rounds the double-precision floating-point value `a' to an integer, and
2423 | returns the result as a double-precision floating-point value. The
2424 | operation is performed according to the IEC/IEEE Standard for Binary
2425 | Floating-Point Arithmetic.
2426 *----------------------------------------------------------------------------*/
2428 float64 float64_round_to_int( float64 a STATUS_PARAM )
2432 bits64 lastBitMask, roundBitsMask;
2436 aExp = extractFloat64Exp( a );
2437 if ( 0x433 <= aExp ) {
2438 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2439 return propagateFloat64NaN( a, a STATUS_VAR );
2443 if ( aExp < 0x3FF ) {
2444 if ( (bits64) ( a<<1 ) == 0 ) return a;
2445 STATUS(float_exception_flags) |= float_flag_inexact;
2446 aSign = extractFloat64Sign( a );
2447 switch ( STATUS(float_rounding_mode) ) {
2448 case float_round_nearest_even:
2449 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2450 return packFloat64( aSign, 0x3FF, 0 );
2453 case float_round_down:
2454 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2455 case float_round_up:
2457 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2459 return packFloat64( aSign, 0, 0 );
2462 lastBitMask <<= 0x433 - aExp;
2463 roundBitsMask = lastBitMask - 1;
2465 roundingMode = STATUS(float_rounding_mode);
2466 if ( roundingMode == float_round_nearest_even ) {
2467 z += lastBitMask>>1;
2468 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2470 else if ( roundingMode != float_round_to_zero ) {
2471 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2475 z &= ~ roundBitsMask;
2476 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2481 /*----------------------------------------------------------------------------
2482 | Returns the result of adding the absolute values of the double-precision
2483 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2484 | before being returned. `zSign' is ignored if the result is a NaN.
2485 | The addition is performed according to the IEC/IEEE Standard for Binary
2486 | Floating-Point Arithmetic.
2487 *----------------------------------------------------------------------------*/
2489 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2491 int16 aExp, bExp, zExp;
2492 bits64 aSig, bSig, zSig;
2495 aSig = extractFloat64Frac( a );
2496 aExp = extractFloat64Exp( a );
2497 bSig = extractFloat64Frac( b );
2498 bExp = extractFloat64Exp( b );
2499 expDiff = aExp - bExp;
2502 if ( 0 < expDiff ) {
2503 if ( aExp == 0x7FF ) {
2504 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2511 bSig |= LIT64( 0x2000000000000000 );
2513 shift64RightJamming( bSig, expDiff, &bSig );
2516 else if ( expDiff < 0 ) {
2517 if ( bExp == 0x7FF ) {
2518 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2519 return packFloat64( zSign, 0x7FF, 0 );
2525 aSig |= LIT64( 0x2000000000000000 );
2527 shift64RightJamming( aSig, - expDiff, &aSig );
2531 if ( aExp == 0x7FF ) {
2532 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2535 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2536 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2540 aSig |= LIT64( 0x2000000000000000 );
2541 zSig = ( aSig + bSig )<<1;
2543 if ( (sbits64) zSig < 0 ) {
2548 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2552 /*----------------------------------------------------------------------------
2553 | Returns the result of subtracting the absolute values of the double-
2554 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2555 | difference is negated before being returned. `zSign' is ignored if the
2556 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2557 | Standard for Binary Floating-Point Arithmetic.
2558 *----------------------------------------------------------------------------*/
2560 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2562 int16 aExp, bExp, zExp;
2563 bits64 aSig, bSig, zSig;
2566 aSig = extractFloat64Frac( a );
2567 aExp = extractFloat64Exp( a );
2568 bSig = extractFloat64Frac( b );
2569 bExp = extractFloat64Exp( b );
2570 expDiff = aExp - bExp;
2573 if ( 0 < expDiff ) goto aExpBigger;
2574 if ( expDiff < 0 ) goto bExpBigger;
2575 if ( aExp == 0x7FF ) {
2576 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577 float_raise( float_flag_invalid STATUS_VAR);
2578 return float64_default_nan;
2584 if ( bSig < aSig ) goto aBigger;
2585 if ( aSig < bSig ) goto bBigger;
2586 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2588 if ( bExp == 0x7FF ) {
2589 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2590 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2596 aSig |= LIT64( 0x4000000000000000 );
2598 shift64RightJamming( aSig, - expDiff, &aSig );
2599 bSig |= LIT64( 0x4000000000000000 );
2604 goto normalizeRoundAndPack;
2606 if ( aExp == 0x7FF ) {
2607 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2614 bSig |= LIT64( 0x4000000000000000 );
2616 shift64RightJamming( bSig, expDiff, &bSig );
2617 aSig |= LIT64( 0x4000000000000000 );
2621 normalizeRoundAndPack:
2623 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of adding the double-precision floating-point values `a'
2629 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2630 | Binary Floating-Point Arithmetic.
2631 *----------------------------------------------------------------------------*/
2633 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2637 aSign = extractFloat64Sign( a );
2638 bSign = extractFloat64Sign( b );
2639 if ( aSign == bSign ) {
2640 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2643 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2648 /*----------------------------------------------------------------------------
2649 | Returns the result of subtracting the double-precision floating-point values
2650 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2651 | for Binary Floating-Point Arithmetic.
2652 *----------------------------------------------------------------------------*/
2654 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2658 aSign = extractFloat64Sign( a );
2659 bSign = extractFloat64Sign( b );
2660 if ( aSign == bSign ) {
2661 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2664 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2669 /*----------------------------------------------------------------------------
2670 | Returns the result of multiplying the double-precision floating-point values
2671 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2672 | for Binary Floating-Point Arithmetic.
2673 *----------------------------------------------------------------------------*/
2675 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2677 flag aSign, bSign, zSign;
2678 int16 aExp, bExp, zExp;
2679 bits64 aSig, bSig, zSig0, zSig1;
2681 aSig = extractFloat64Frac( a );
2682 aExp = extractFloat64Exp( a );
2683 aSign = extractFloat64Sign( a );
2684 bSig = extractFloat64Frac( b );
2685 bExp = extractFloat64Exp( b );
2686 bSign = extractFloat64Sign( b );
2687 zSign = aSign ^ bSign;
2688 if ( aExp == 0x7FF ) {
2689 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2690 return propagateFloat64NaN( a, b STATUS_VAR );
2692 if ( ( bExp | bSig ) == 0 ) {
2693 float_raise( float_flag_invalid STATUS_VAR);
2694 return float64_default_nan;
2696 return packFloat64( zSign, 0x7FF, 0 );
2698 if ( bExp == 0x7FF ) {
2699 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2700 if ( ( aExp | aSig ) == 0 ) {
2701 float_raise( float_flag_invalid STATUS_VAR);
2702 return float64_default_nan;
2704 return packFloat64( zSign, 0x7FF, 0 );
2707 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2708 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2711 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2712 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2714 zExp = aExp + bExp - 0x3FF;
2715 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2716 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2717 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2718 zSig0 |= ( zSig1 != 0 );
2719 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2723 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2727 /*----------------------------------------------------------------------------
2728 | Returns the result of dividing the double-precision floating-point value `a'
2729 | by the corresponding value `b'. The operation is performed according to
2730 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2733 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2735 flag aSign, bSign, zSign;
2736 int16 aExp, bExp, zExp;
2737 bits64 aSig, bSig, zSig;
2739 bits64 term0, term1;
2741 aSig = extractFloat64Frac( a );
2742 aExp = extractFloat64Exp( a );
2743 aSign = extractFloat64Sign( a );
2744 bSig = extractFloat64Frac( b );
2745 bExp = extractFloat64Exp( b );
2746 bSign = extractFloat64Sign( b );
2747 zSign = aSign ^ bSign;
2748 if ( aExp == 0x7FF ) {
2749 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2750 if ( bExp == 0x7FF ) {
2751 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2752 float_raise( float_flag_invalid STATUS_VAR);
2753 return float64_default_nan;
2755 return packFloat64( zSign, 0x7FF, 0 );
2757 if ( bExp == 0x7FF ) {
2758 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2759 return packFloat64( zSign, 0, 0 );
2763 if ( ( aExp | aSig ) == 0 ) {
2764 float_raise( float_flag_invalid STATUS_VAR);
2765 return float64_default_nan;
2767 float_raise( float_flag_divbyzero STATUS_VAR);
2768 return packFloat64( zSign, 0x7FF, 0 );
2770 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2773 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2774 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2776 zExp = aExp - bExp + 0x3FD;
2777 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2778 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2779 if ( bSig <= ( aSig + aSig ) ) {
2783 zSig = estimateDiv128To64( aSig, 0, bSig );
2784 if ( ( zSig & 0x1FF ) <= 2 ) {
2785 mul64To128( bSig, zSig, &term0, &term1 );
2786 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2787 while ( (sbits64) rem0 < 0 ) {
2789 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2791 zSig |= ( rem1 != 0 );
2793 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2797 /*----------------------------------------------------------------------------
2798 | Returns the remainder of the double-precision floating-point value `a'
2799 | with respect to the corresponding value `b'. The operation is performed
2800 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2801 *----------------------------------------------------------------------------*/
2803 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2805 flag aSign, bSign, zSign;
2806 int16 aExp, bExp, expDiff;
2808 bits64 q, alternateASig;
2811 aSig = extractFloat64Frac( a );
2812 aExp = extractFloat64Exp( a );
2813 aSign = extractFloat64Sign( a );
2814 bSig = extractFloat64Frac( b );
2815 bExp = extractFloat64Exp( b );
2816 bSign = extractFloat64Sign( b );
2817 if ( aExp == 0x7FF ) {
2818 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2819 return propagateFloat64NaN( a, b STATUS_VAR );
2821 float_raise( float_flag_invalid STATUS_VAR);
2822 return float64_default_nan;
2824 if ( bExp == 0x7FF ) {
2825 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2830 float_raise( float_flag_invalid STATUS_VAR);
2831 return float64_default_nan;
2833 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2836 if ( aSig == 0 ) return a;
2837 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2839 expDiff = aExp - bExp;
2840 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2841 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842 if ( expDiff < 0 ) {
2843 if ( expDiff < -1 ) return a;
2846 q = ( bSig <= aSig );
2847 if ( q ) aSig -= bSig;
2849 while ( 0 < expDiff ) {
2850 q = estimateDiv128To64( aSig, 0, bSig );
2851 q = ( 2 < q ) ? q - 2 : 0;
2852 aSig = - ( ( bSig>>2 ) * q );
2856 if ( 0 < expDiff ) {
2857 q = estimateDiv128To64( aSig, 0, bSig );
2858 q = ( 2 < q ) ? q - 2 : 0;
2861 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2868 alternateASig = aSig;
2871 } while ( 0 <= (sbits64) aSig );
2872 sigMean = aSig + alternateASig;
2873 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2874 aSig = alternateASig;
2876 zSign = ( (sbits64) aSig < 0 );
2877 if ( zSign ) aSig = - aSig;
2878 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2882 /*----------------------------------------------------------------------------
2883 | Returns the square root of the double-precision floating-point value `a'.
2884 | The operation is performed according to the IEC/IEEE Standard for Binary
2885 | Floating-Point Arithmetic.
2886 *----------------------------------------------------------------------------*/
2888 float64 float64_sqrt( float64 a STATUS_PARAM )
2892 bits64 aSig, zSig, doubleZSig;
2893 bits64 rem0, rem1, term0, term1;
2895 aSig = extractFloat64Frac( a );
2896 aExp = extractFloat64Exp( a );
2897 aSign = extractFloat64Sign( a );
2898 if ( aExp == 0x7FF ) {
2899 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2900 if ( ! aSign ) return a;
2901 float_raise( float_flag_invalid STATUS_VAR);
2902 return float64_default_nan;
2905 if ( ( aExp | aSig ) == 0 ) return a;
2906 float_raise( float_flag_invalid STATUS_VAR);
2907 return float64_default_nan;
2910 if ( aSig == 0 ) return 0;
2911 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2913 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2914 aSig |= LIT64( 0x0010000000000000 );
2915 zSig = estimateSqrt32( aExp, aSig>>21 );
2916 aSig <<= 9 - ( aExp & 1 );
2917 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2918 if ( ( zSig & 0x1FF ) <= 5 ) {
2919 doubleZSig = zSig<<1;
2920 mul64To128( zSig, zSig, &term0, &term1 );
2921 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2922 while ( (sbits64) rem0 < 0 ) {
2925 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2927 zSig |= ( ( rem0 | rem1 ) != 0 );
2929 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2933 /*----------------------------------------------------------------------------
2934 | Returns 1 if the double-precision floating-point value `a' is equal to the
2935 | corresponding value `b', and 0 otherwise. The comparison is performed
2936 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2937 *----------------------------------------------------------------------------*/
2939 flag float64_eq( float64 a, float64 b STATUS_PARAM )
2942 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2943 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2945 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2946 float_raise( float_flag_invalid STATUS_VAR);
2950 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2954 /*----------------------------------------------------------------------------
2955 | Returns 1 if the double-precision floating-point value `a' is less than or
2956 | equal to the corresponding value `b', and 0 otherwise. The comparison is
2957 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2959 *----------------------------------------------------------------------------*/
2961 flag float64_le( float64 a, float64 b STATUS_PARAM )
2965 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2966 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2968 float_raise( float_flag_invalid STATUS_VAR);
2971 aSign = extractFloat64Sign( a );
2972 bSign = extractFloat64Sign( b );
2973 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2974 return ( a == b ) || ( aSign ^ ( a < b ) );
2978 /*----------------------------------------------------------------------------
2979 | Returns 1 if the double-precision floating-point value `a' is less than
2980 | the corresponding value `b', and 0 otherwise. The comparison is performed
2981 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2982 *----------------------------------------------------------------------------*/
2984 flag float64_lt( float64 a, float64 b STATUS_PARAM )
2988 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2989 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2991 float_raise( float_flag_invalid STATUS_VAR);
2994 aSign = extractFloat64Sign( a );
2995 bSign = extractFloat64Sign( b );
2996 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2997 return ( a != b ) && ( aSign ^ ( a < b ) );
3001 /*----------------------------------------------------------------------------
3002 | Returns 1 if the double-precision floating-point value `a' is equal to the
3003 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3004 | if either operand is a NaN. Otherwise, the comparison is performed
3005 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3006 *----------------------------------------------------------------------------*/
3008 flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3011 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3012 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3014 float_raise( float_flag_invalid STATUS_VAR);
3017 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3021 /*----------------------------------------------------------------------------
3022 | Returns 1 if the double-precision floating-point value `a' is less than or
3023 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3024 | cause an exception. Otherwise, the comparison is performed according to the
3025 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026 *----------------------------------------------------------------------------*/
3028 flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3032 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3035 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3036 float_raise( float_flag_invalid STATUS_VAR);
3040 aSign = extractFloat64Sign( a );
3041 bSign = extractFloat64Sign( b );
3042 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3043 return ( a == b ) || ( aSign ^ ( a < b ) );
3047 /*----------------------------------------------------------------------------
3048 | Returns 1 if the double-precision floating-point value `a' is less than
3049 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3050 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3051 | Standard for Binary Floating-Point Arithmetic.
3052 *----------------------------------------------------------------------------*/
3054 flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3058 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3061 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3062 float_raise( float_flag_invalid STATUS_VAR);
3066 aSign = extractFloat64Sign( a );
3067 bSign = extractFloat64Sign( b );
3068 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3069 return ( a != b ) && ( aSign ^ ( a < b ) );
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the extended double-precision floating-
3077 | point value `a' to the 32-bit two's complement integer format. The
3078 | conversion is performed according to the IEC/IEEE Standard for Binary
3079 | Floating-Point Arithmetic---which means in particular that the conversion
3080 | is rounded according to the current rounding mode. If `a' is a NaN, the
3081 | largest positive integer is returned. Otherwise, if the conversion
3082 | overflows, the largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3085 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3088 int32 aExp, shiftCount;
3091 aSig = extractFloatx80Frac( a );
3092 aExp = extractFloatx80Exp( a );
3093 aSign = extractFloatx80Sign( a );
3094 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3095 shiftCount = 0x4037 - aExp;
3096 if ( shiftCount <= 0 ) shiftCount = 1;
3097 shift64RightJamming( aSig, shiftCount, &aSig );
3098 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3102 /*----------------------------------------------------------------------------
3103 | Returns the result of converting the extended double-precision floating-
3104 | point value `a' to the 32-bit two's complement integer format. The
3105 | conversion is performed according to the IEC/IEEE Standard for Binary
3106 | Floating-Point Arithmetic, except that the conversion is always rounded
3107 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3108 | Otherwise, if the conversion overflows, the largest integer with the same
3109 | sign as `a' is returned.
3110 *----------------------------------------------------------------------------*/
3112 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3115 int32 aExp, shiftCount;
3116 bits64 aSig, savedASig;
3119 aSig = extractFloatx80Frac( a );
3120 aExp = extractFloatx80Exp( a );
3121 aSign = extractFloatx80Sign( a );
3122 if ( 0x401E < aExp ) {
3123 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3126 else if ( aExp < 0x3FFF ) {
3127 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3130 shiftCount = 0x403E - aExp;
3132 aSig >>= shiftCount;
3134 if ( aSign ) z = - z;
3135 if ( ( z < 0 ) ^ aSign ) {
3137 float_raise( float_flag_invalid STATUS_VAR);
3138 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3140 if ( ( aSig<<shiftCount ) != savedASig ) {
3141 STATUS(float_exception_flags) |= float_flag_inexact;
3147 /*----------------------------------------------------------------------------
3148 | Returns the result of converting the extended double-precision floating-
3149 | point value `a' to the 64-bit two's complement integer format. The
3150 | conversion is performed according to the IEC/IEEE Standard for Binary
3151 | Floating-Point Arithmetic---which means in particular that the conversion
3152 | is rounded according to the current rounding mode. If `a' is a NaN,
3153 | the largest positive integer is returned. Otherwise, if the conversion
3154 | overflows, the largest integer with the same sign as `a' is returned.
3155 *----------------------------------------------------------------------------*/
3157 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3160 int32 aExp, shiftCount;
3161 bits64 aSig, aSigExtra;
3163 aSig = extractFloatx80Frac( a );
3164 aExp = extractFloatx80Exp( a );
3165 aSign = extractFloatx80Sign( a );
3166 shiftCount = 0x403E - aExp;
3167 if ( shiftCount <= 0 ) {
3169 float_raise( float_flag_invalid STATUS_VAR);
3171 || ( ( aExp == 0x7FFF )
3172 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3174 return LIT64( 0x7FFFFFFFFFFFFFFF );
3176 return (sbits64) LIT64( 0x8000000000000000 );
3181 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3183 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of converting the extended double-precision floating-
3189 | point value `a' to the 64-bit two's complement integer format. The
3190 | conversion is performed according to the IEC/IEEE Standard for Binary
3191 | Floating-Point Arithmetic, except that the conversion is always rounded
3192 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3193 | Otherwise, if the conversion overflows, the largest integer with the same
3194 | sign as `a' is returned.
3195 *----------------------------------------------------------------------------*/
3197 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3200 int32 aExp, shiftCount;
3204 aSig = extractFloatx80Frac( a );
3205 aExp = extractFloatx80Exp( a );
3206 aSign = extractFloatx80Sign( a );
3207 shiftCount = aExp - 0x403E;
3208 if ( 0 <= shiftCount ) {
3209 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3210 if ( ( a.high != 0xC03E ) || aSig ) {
3211 float_raise( float_flag_invalid STATUS_VAR);
3212 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3213 return LIT64( 0x7FFFFFFFFFFFFFFF );
3216 return (sbits64) LIT64( 0x8000000000000000 );
3218 else if ( aExp < 0x3FFF ) {
3219 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3222 z = aSig>>( - shiftCount );
3223 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3224 STATUS(float_exception_flags) |= float_flag_inexact;
3226 if ( aSign ) z = - z;
3231 /*----------------------------------------------------------------------------
3232 | Returns the result of converting the extended double-precision floating-
3233 | point value `a' to the single-precision floating-point format. The
3234 | conversion is performed according to the IEC/IEEE Standard for Binary
3235 | Floating-Point Arithmetic.
3236 *----------------------------------------------------------------------------*/
3238 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3244 aSig = extractFloatx80Frac( a );
3245 aExp = extractFloatx80Exp( a );
3246 aSign = extractFloatx80Sign( a );
3247 if ( aExp == 0x7FFF ) {
3248 if ( (bits64) ( aSig<<1 ) ) {
3249 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3251 return packFloat32( aSign, 0xFF, 0 );
3253 shift64RightJamming( aSig, 33, &aSig );
3254 if ( aExp || aSig ) aExp -= 0x3F81;
3255 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3259 /*----------------------------------------------------------------------------
3260 | Returns the result of converting the extended double-precision floating-
3261 | point value `a' to the double-precision floating-point format. The
3262 | conversion is performed according to the IEC/IEEE Standard for Binary
3263 | Floating-Point Arithmetic.
3264 *----------------------------------------------------------------------------*/
3266 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3272 aSig = extractFloatx80Frac( a );
3273 aExp = extractFloatx80Exp( a );
3274 aSign = extractFloatx80Sign( a );
3275 if ( aExp == 0x7FFF ) {
3276 if ( (bits64) ( aSig<<1 ) ) {
3277 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3279 return packFloat64( aSign, 0x7FF, 0 );
3281 shift64RightJamming( aSig, 1, &zSig );
3282 if ( aExp || aSig ) aExp -= 0x3C01;
3283 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3289 /*----------------------------------------------------------------------------
3290 | Returns the result of converting the extended double-precision floating-
3291 | point value `a' to the quadruple-precision floating-point format. The
3292 | conversion is performed according to the IEC/IEEE Standard for Binary
3293 | Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3296 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3300 bits64 aSig, zSig0, zSig1;
3302 aSig = extractFloatx80Frac( a );
3303 aExp = extractFloatx80Exp( a );
3304 aSign = extractFloatx80Sign( a );
3305 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3306 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3308 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3309 return packFloat128( aSign, aExp, zSig0, zSig1 );
3315 /*----------------------------------------------------------------------------
3316 | Rounds the extended double-precision floating-point value `a' to an integer,
3317 | and returns the result as an extended quadruple-precision floating-point
3318 | value. The operation is performed according to the IEC/IEEE Standard for
3319 | Binary Floating-Point Arithmetic.
3320 *----------------------------------------------------------------------------*/
3322 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3326 bits64 lastBitMask, roundBitsMask;
3330 aExp = extractFloatx80Exp( a );
3331 if ( 0x403E <= aExp ) {
3332 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3333 return propagateFloatx80NaN( a, a STATUS_VAR );
3337 if ( aExp < 0x3FFF ) {
3339 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3342 STATUS(float_exception_flags) |= float_flag_inexact;
3343 aSign = extractFloatx80Sign( a );
3344 switch ( STATUS(float_rounding_mode) ) {
3345 case float_round_nearest_even:
3346 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3349 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3352 case float_round_down:
3355 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3356 : packFloatx80( 0, 0, 0 );
3357 case float_round_up:
3359 aSign ? packFloatx80( 1, 0, 0 )
3360 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3362 return packFloatx80( aSign, 0, 0 );
3365 lastBitMask <<= 0x403E - aExp;
3366 roundBitsMask = lastBitMask - 1;
3368 roundingMode = STATUS(float_rounding_mode);
3369 if ( roundingMode == float_round_nearest_even ) {
3370 z.low += lastBitMask>>1;
3371 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3373 else if ( roundingMode != float_round_to_zero ) {
3374 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3375 z.low += roundBitsMask;
3378 z.low &= ~ roundBitsMask;
3381 z.low = LIT64( 0x8000000000000000 );
3383 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3388 /*----------------------------------------------------------------------------
3389 | Returns the result of adding the absolute values of the extended double-
3390 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3391 | negated before being returned. `zSign' is ignored if the result is a NaN.
3392 | The addition is performed according to the IEC/IEEE Standard for Binary
3393 | Floating-Point Arithmetic.
3394 *----------------------------------------------------------------------------*/
3396 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3398 int32 aExp, bExp, zExp;
3399 bits64 aSig, bSig, zSig0, zSig1;
3402 aSig = extractFloatx80Frac( a );
3403 aExp = extractFloatx80Exp( a );
3404 bSig = extractFloatx80Frac( b );
3405 bExp = extractFloatx80Exp( b );
3406 expDiff = aExp - bExp;
3407 if ( 0 < expDiff ) {
3408 if ( aExp == 0x7FFF ) {
3409 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3412 if ( bExp == 0 ) --expDiff;
3413 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3416 else if ( expDiff < 0 ) {
3417 if ( bExp == 0x7FFF ) {
3418 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3419 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3421 if ( aExp == 0 ) ++expDiff;
3422 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3426 if ( aExp == 0x7FFF ) {
3427 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3428 return propagateFloatx80NaN( a, b STATUS_VAR );
3433 zSig0 = aSig + bSig;
3435 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3441 zSig0 = aSig + bSig;
3442 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3444 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3445 zSig0 |= LIT64( 0x8000000000000000 );
3449 roundAndPackFloatx80(
3450 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3454 /*----------------------------------------------------------------------------
3455 | Returns the result of subtracting the absolute values of the extended
3456 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3457 | difference is negated before being returned. `zSign' is ignored if the
3458 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3459 | Standard for Binary Floating-Point Arithmetic.
3460 *----------------------------------------------------------------------------*/
3462 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3464 int32 aExp, bExp, zExp;
3465 bits64 aSig, bSig, zSig0, zSig1;
3469 aSig = extractFloatx80Frac( a );
3470 aExp = extractFloatx80Exp( a );
3471 bSig = extractFloatx80Frac( b );
3472 bExp = extractFloatx80Exp( b );
3473 expDiff = aExp - bExp;
3474 if ( 0 < expDiff ) goto aExpBigger;
3475 if ( expDiff < 0 ) goto bExpBigger;
3476 if ( aExp == 0x7FFF ) {
3477 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3478 return propagateFloatx80NaN( a, b STATUS_VAR );
3480 float_raise( float_flag_invalid STATUS_VAR);
3481 z.low = floatx80_default_nan_low;
3482 z.high = floatx80_default_nan_high;
3490 if ( bSig < aSig ) goto aBigger;
3491 if ( aSig < bSig ) goto bBigger;
3492 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3494 if ( bExp == 0x7FFF ) {
3495 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3496 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3498 if ( aExp == 0 ) ++expDiff;
3499 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3501 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3504 goto normalizeRoundAndPack;
3506 if ( aExp == 0x7FFF ) {
3507 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3510 if ( bExp == 0 ) --expDiff;
3511 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3513 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3515 normalizeRoundAndPack:
3517 normalizeRoundAndPackFloatx80(
3518 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3522 /*----------------------------------------------------------------------------
3523 | Returns the result of adding the extended double-precision floating-point
3524 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3525 | Standard for Binary Floating-Point Arithmetic.
3526 *----------------------------------------------------------------------------*/
3528 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3532 aSign = extractFloatx80Sign( a );
3533 bSign = extractFloatx80Sign( b );
3534 if ( aSign == bSign ) {
3535 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3538 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3543 /*----------------------------------------------------------------------------
3544 | Returns the result of subtracting the extended double-precision floating-
3545 | point values `a' and `b'. The operation is performed according to the
3546 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3547 *----------------------------------------------------------------------------*/
3549 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3553 aSign = extractFloatx80Sign( a );
3554 bSign = extractFloatx80Sign( b );
3555 if ( aSign == bSign ) {
3556 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3559 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3564 /*----------------------------------------------------------------------------
3565 | Returns the result of multiplying the extended double-precision floating-
3566 | point values `a' and `b'. The operation is performed according to the
3567 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3568 *----------------------------------------------------------------------------*/
3570 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3572 flag aSign, bSign, zSign;
3573 int32 aExp, bExp, zExp;
3574 bits64 aSig, bSig, zSig0, zSig1;
3577 aSig = extractFloatx80Frac( a );
3578 aExp = extractFloatx80Exp( a );
3579 aSign = extractFloatx80Sign( a );
3580 bSig = extractFloatx80Frac( b );
3581 bExp = extractFloatx80Exp( b );
3582 bSign = extractFloatx80Sign( b );
3583 zSign = aSign ^ bSign;
3584 if ( aExp == 0x7FFF ) {
3585 if ( (bits64) ( aSig<<1 )
3586 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3587 return propagateFloatx80NaN( a, b STATUS_VAR );
3589 if ( ( bExp | bSig ) == 0 ) goto invalid;
3590 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3592 if ( bExp == 0x7FFF ) {
3593 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3594 if ( ( aExp | aSig ) == 0 ) {
3596 float_raise( float_flag_invalid STATUS_VAR);
3597 z.low = floatx80_default_nan_low;
3598 z.high = floatx80_default_nan_high;
3601 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3604 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3605 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3608 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3609 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3611 zExp = aExp + bExp - 0x3FFE;
3612 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3613 if ( 0 < (sbits64) zSig0 ) {
3614 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618 roundAndPackFloatx80(
3619 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3623 /*----------------------------------------------------------------------------
3624 | Returns the result of dividing the extended double-precision floating-point
3625 | value `a' by the corresponding value `b'. The operation is performed
3626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3627 *----------------------------------------------------------------------------*/
3629 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3631 flag aSign, bSign, zSign;
3632 int32 aExp, bExp, zExp;
3633 bits64 aSig, bSig, zSig0, zSig1;
3634 bits64 rem0, rem1, rem2, term0, term1, term2;
3637 aSig = extractFloatx80Frac( a );
3638 aExp = extractFloatx80Exp( a );
3639 aSign = extractFloatx80Sign( a );
3640 bSig = extractFloatx80Frac( b );
3641 bExp = extractFloatx80Exp( b );
3642 bSign = extractFloatx80Sign( b );
3643 zSign = aSign ^ bSign;
3644 if ( aExp == 0x7FFF ) {
3645 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3646 if ( bExp == 0x7FFF ) {
3647 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3650 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3652 if ( bExp == 0x7FFF ) {
3653 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3654 return packFloatx80( zSign, 0, 0 );
3658 if ( ( aExp | aSig ) == 0 ) {
3660 float_raise( float_flag_invalid STATUS_VAR);
3661 z.low = floatx80_default_nan_low;
3662 z.high = floatx80_default_nan_high;
3665 float_raise( float_flag_divbyzero STATUS_VAR);
3666 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3668 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3671 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3672 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3674 zExp = aExp - bExp + 0x3FFE;
3676 if ( bSig <= aSig ) {
3677 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3680 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3681 mul64To128( bSig, zSig0, &term0, &term1 );
3682 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3683 while ( (sbits64) rem0 < 0 ) {
3685 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3687 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3688 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3689 mul64To128( bSig, zSig1, &term1, &term2 );
3690 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3691 while ( (sbits64) rem1 < 0 ) {
3693 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3695 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3698 roundAndPackFloatx80(
3699 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3703 /*----------------------------------------------------------------------------
3704 | Returns the remainder of the extended double-precision floating-point value
3705 | `a' with respect to the corresponding value `b'. The operation is performed
3706 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3707 *----------------------------------------------------------------------------*/
3709 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3711 flag aSign, bSign, zSign;
3712 int32 aExp, bExp, expDiff;
3713 bits64 aSig0, aSig1, bSig;
3714 bits64 q, term0, term1, alternateASig0, alternateASig1;
3717 aSig0 = extractFloatx80Frac( a );
3718 aExp = extractFloatx80Exp( a );
3719 aSign = extractFloatx80Sign( a );
3720 bSig = extractFloatx80Frac( b );
3721 bExp = extractFloatx80Exp( b );
3722 bSign = extractFloatx80Sign( b );
3723 if ( aExp == 0x7FFF ) {
3724 if ( (bits64) ( aSig0<<1 )
3725 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3726 return propagateFloatx80NaN( a, b STATUS_VAR );
3730 if ( bExp == 0x7FFF ) {
3731 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3737 float_raise( float_flag_invalid STATUS_VAR);
3738 z.low = floatx80_default_nan_low;
3739 z.high = floatx80_default_nan_high;
3742 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3745 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3746 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3748 bSig |= LIT64( 0x8000000000000000 );
3750 expDiff = aExp - bExp;
3752 if ( expDiff < 0 ) {
3753 if ( expDiff < -1 ) return a;
3754 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3757 q = ( bSig <= aSig0 );
3758 if ( q ) aSig0 -= bSig;
3760 while ( 0 < expDiff ) {
3761 q = estimateDiv128To64( aSig0, aSig1, bSig );
3762 q = ( 2 < q ) ? q - 2 : 0;
3763 mul64To128( bSig, q, &term0, &term1 );
3764 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3765 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3769 if ( 0 < expDiff ) {
3770 q = estimateDiv128To64( aSig0, aSig1, bSig );
3771 q = ( 2 < q ) ? q - 2 : 0;
3773 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3774 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3775 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3776 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3778 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3785 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3786 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3787 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3790 aSig0 = alternateASig0;
3791 aSig1 = alternateASig1;
3795 normalizeRoundAndPackFloatx80(
3796 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3800 /*----------------------------------------------------------------------------
3801 | Returns the square root of the extended double-precision floating-point
3802 | value `a'. The operation is performed according to the IEC/IEEE Standard
3803 | for Binary Floating-Point Arithmetic.
3804 *----------------------------------------------------------------------------*/
3806 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3810 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3811 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3814 aSig0 = extractFloatx80Frac( a );
3815 aExp = extractFloatx80Exp( a );
3816 aSign = extractFloatx80Sign( a );
3817 if ( aExp == 0x7FFF ) {
3818 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3819 if ( ! aSign ) return a;
3823 if ( ( aExp | aSig0 ) == 0 ) return a;
3825 float_raise( float_flag_invalid STATUS_VAR);
3826 z.low = floatx80_default_nan_low;
3827 z.high = floatx80_default_nan_high;
3831 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3832 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3834 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3835 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3836 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3837 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3838 doubleZSig0 = zSig0<<1;
3839 mul64To128( zSig0, zSig0, &term0, &term1 );
3840 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3841 while ( (sbits64) rem0 < 0 ) {
3844 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3846 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3847 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3848 if ( zSig1 == 0 ) zSig1 = 1;
3849 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3850 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3851 mul64To128( zSig1, zSig1, &term2, &term3 );
3852 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3853 while ( (sbits64) rem1 < 0 ) {
3855 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3857 term2 |= doubleZSig0;
3858 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3860 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3862 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3863 zSig0 |= doubleZSig0;
3865 roundAndPackFloatx80(
3866 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3870 /*----------------------------------------------------------------------------
3871 | Returns 1 if the extended double-precision floating-point value `a' is
3872 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3873 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3875 *----------------------------------------------------------------------------*/
3877 flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3880 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3881 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3882 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3883 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3885 if ( floatx80_is_signaling_nan( a )
3886 || floatx80_is_signaling_nan( b ) ) {
3887 float_raise( float_flag_invalid STATUS_VAR);
3893 && ( ( a.high == b.high )
3895 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3900 /*----------------------------------------------------------------------------
3901 | Returns 1 if the extended double-precision floating-point value `a' is
3902 | less than or equal to the corresponding value `b', and 0 otherwise. The
3903 | comparison is performed according to the IEC/IEEE Standard for Binary
3904 | Floating-Point Arithmetic.
3905 *----------------------------------------------------------------------------*/
3907 flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3911 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3912 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3913 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3914 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3916 float_raise( float_flag_invalid STATUS_VAR);
3919 aSign = extractFloatx80Sign( a );
3920 bSign = extractFloatx80Sign( b );
3921 if ( aSign != bSign ) {
3924 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3928 aSign ? le128( b.high, b.low, a.high, a.low )
3929 : le128( a.high, a.low, b.high, b.low );
3933 /*----------------------------------------------------------------------------
3934 | Returns 1 if the extended double-precision floating-point value `a' is
3935 | less than the corresponding value `b', and 0 otherwise. The comparison
3936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3938 *----------------------------------------------------------------------------*/
3940 flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3944 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3945 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3946 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3947 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3949 float_raise( float_flag_invalid STATUS_VAR);
3952 aSign = extractFloatx80Sign( a );
3953 bSign = extractFloatx80Sign( b );
3954 if ( aSign != bSign ) {
3957 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3961 aSign ? lt128( b.high, b.low, a.high, a.low )
3962 : lt128( a.high, a.low, b.high, b.low );
3966 /*----------------------------------------------------------------------------
3967 | Returns 1 if the extended double-precision floating-point value `a' is equal
3968 | to the corresponding value `b', and 0 otherwise. The invalid exception is
3969 | raised if either operand is a NaN. Otherwise, the comparison is performed
3970 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3973 flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3976 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3977 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3978 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3979 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3981 float_raise( float_flag_invalid STATUS_VAR);
3986 && ( ( a.high == b.high )
3988 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the extended double-precision floating-point value `a' is less
3995 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3996 | do not cause an exception. Otherwise, the comparison is performed according
3997 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
4000 flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4004 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4005 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4006 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4007 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4009 if ( floatx80_is_signaling_nan( a )
4010 || floatx80_is_signaling_nan( b ) ) {
4011 float_raise( float_flag_invalid STATUS_VAR);
4015 aSign = extractFloatx80Sign( a );
4016 bSign = extractFloatx80Sign( b );
4017 if ( aSign != bSign ) {
4020 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4024 aSign ? le128( b.high, b.low, a.high, a.low )
4025 : le128( a.high, a.low, b.high, b.low );
4029 /*----------------------------------------------------------------------------
4030 | Returns 1 if the extended double-precision floating-point value `a' is less
4031 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4032 | an exception. Otherwise, the comparison is performed according to the
4033 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4034 *----------------------------------------------------------------------------*/
4036 flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4040 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4041 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4042 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4043 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4045 if ( floatx80_is_signaling_nan( a )
4046 || floatx80_is_signaling_nan( b ) ) {
4047 float_raise( float_flag_invalid STATUS_VAR);
4051 aSign = extractFloatx80Sign( a );
4052 bSign = extractFloatx80Sign( b );
4053 if ( aSign != bSign ) {
4056 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4060 aSign ? lt128( b.high, b.low, a.high, a.low )
4061 : lt128( a.high, a.low, b.high, b.low );
4069 /*----------------------------------------------------------------------------
4070 | Returns the result of converting the quadruple-precision floating-point
4071 | value `a' to the 32-bit two's complement integer format. The conversion
4072 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4073 | Arithmetic---which means in particular that the conversion is rounded
4074 | according to the current rounding mode. If `a' is a NaN, the largest
4075 | positive integer is returned. Otherwise, if the conversion overflows, the
4076 | largest integer with the same sign as `a' is returned.
4077 *----------------------------------------------------------------------------*/
4079 int32 float128_to_int32( float128 a STATUS_PARAM )
4082 int32 aExp, shiftCount;
4083 bits64 aSig0, aSig1;
4085 aSig1 = extractFloat128Frac1( a );
4086 aSig0 = extractFloat128Frac0( a );
4087 aExp = extractFloat128Exp( a );
4088 aSign = extractFloat128Sign( a );
4089 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4090 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4091 aSig0 |= ( aSig1 != 0 );
4092 shiftCount = 0x4028 - aExp;
4093 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4094 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4098 /*----------------------------------------------------------------------------
4099 | Returns the result of converting the quadruple-precision floating-point
4100 | value `a' to the 32-bit two's complement integer format. The conversion
4101 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4102 | Arithmetic, except that the conversion is always rounded toward zero. If
4103 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4104 | conversion overflows, the largest integer with the same sign as `a' is
4106 *----------------------------------------------------------------------------*/
4108 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4111 int32 aExp, shiftCount;
4112 bits64 aSig0, aSig1, savedASig;
4115 aSig1 = extractFloat128Frac1( a );
4116 aSig0 = extractFloat128Frac0( a );
4117 aExp = extractFloat128Exp( a );
4118 aSign = extractFloat128Sign( a );
4119 aSig0 |= ( aSig1 != 0 );
4120 if ( 0x401E < aExp ) {
4121 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4124 else if ( aExp < 0x3FFF ) {
4125 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4128 aSig0 |= LIT64( 0x0001000000000000 );
4129 shiftCount = 0x402F - aExp;
4131 aSig0 >>= shiftCount;
4133 if ( aSign ) z = - z;
4134 if ( ( z < 0 ) ^ aSign ) {
4136 float_raise( float_flag_invalid STATUS_VAR);
4137 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4139 if ( ( aSig0<<shiftCount ) != savedASig ) {
4140 STATUS(float_exception_flags) |= float_flag_inexact;
4146 /*----------------------------------------------------------------------------
4147 | Returns the result of converting the quadruple-precision floating-point
4148 | value `a' to the 64-bit two's complement integer format. The conversion
4149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4150 | Arithmetic---which means in particular that the conversion is rounded
4151 | according to the current rounding mode. If `a' is a NaN, the largest
4152 | positive integer is returned. Otherwise, if the conversion overflows, the
4153 | largest integer with the same sign as `a' is returned.
4154 *----------------------------------------------------------------------------*/
4156 int64 float128_to_int64( float128 a STATUS_PARAM )
4159 int32 aExp, shiftCount;
4160 bits64 aSig0, aSig1;
4162 aSig1 = extractFloat128Frac1( a );
4163 aSig0 = extractFloat128Frac0( a );
4164 aExp = extractFloat128Exp( a );
4165 aSign = extractFloat128Sign( a );
4166 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4167 shiftCount = 0x402F - aExp;
4168 if ( shiftCount <= 0 ) {
4169 if ( 0x403E < aExp ) {
4170 float_raise( float_flag_invalid STATUS_VAR);
4172 || ( ( aExp == 0x7FFF )
4173 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4176 return LIT64( 0x7FFFFFFFFFFFFFFF );
4178 return (sbits64) LIT64( 0x8000000000000000 );
4180 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4183 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4185 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4189 /*----------------------------------------------------------------------------
4190 | Returns the result of converting the quadruple-precision floating-point
4191 | value `a' to the 64-bit two's complement integer format. The conversion
4192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4193 | Arithmetic, except that the conversion is always rounded toward zero.
4194 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4195 | the conversion overflows, the largest integer with the same sign as `a' is
4197 *----------------------------------------------------------------------------*/
4199 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4202 int32 aExp, shiftCount;
4203 bits64 aSig0, aSig1;
4206 aSig1 = extractFloat128Frac1( a );
4207 aSig0 = extractFloat128Frac0( a );
4208 aExp = extractFloat128Exp( a );
4209 aSign = extractFloat128Sign( a );
4210 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4211 shiftCount = aExp - 0x402F;
4212 if ( 0 < shiftCount ) {
4213 if ( 0x403E <= aExp ) {
4214 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4215 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4216 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4217 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4220 float_raise( float_flag_invalid STATUS_VAR);
4221 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4222 return LIT64( 0x7FFFFFFFFFFFFFFF );
4225 return (sbits64) LIT64( 0x8000000000000000 );
4227 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4228 if ( (bits64) ( aSig1<<shiftCount ) ) {
4229 STATUS(float_exception_flags) |= float_flag_inexact;
4233 if ( aExp < 0x3FFF ) {
4234 if ( aExp | aSig0 | aSig1 ) {
4235 STATUS(float_exception_flags) |= float_flag_inexact;
4239 z = aSig0>>( - shiftCount );
4241 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4242 STATUS(float_exception_flags) |= float_flag_inexact;
4245 if ( aSign ) z = - z;
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of converting the quadruple-precision floating-point
4252 | value `a' to the single-precision floating-point format. The conversion
4253 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4255 *----------------------------------------------------------------------------*/
4257 float32 float128_to_float32( float128 a STATUS_PARAM )
4261 bits64 aSig0, aSig1;
4264 aSig1 = extractFloat128Frac1( a );
4265 aSig0 = extractFloat128Frac0( a );
4266 aExp = extractFloat128Exp( a );
4267 aSign = extractFloat128Sign( a );
4268 if ( aExp == 0x7FFF ) {
4269 if ( aSig0 | aSig1 ) {
4270 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4272 return packFloat32( aSign, 0xFF, 0 );
4274 aSig0 |= ( aSig1 != 0 );
4275 shift64RightJamming( aSig0, 18, &aSig0 );
4277 if ( aExp || zSig ) {
4281 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4285 /*----------------------------------------------------------------------------
4286 | Returns the result of converting the quadruple-precision floating-point
4287 | value `a' to the double-precision floating-point format. The conversion
4288 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4290 *----------------------------------------------------------------------------*/
4292 float64 float128_to_float64( float128 a STATUS_PARAM )
4296 bits64 aSig0, aSig1;
4298 aSig1 = extractFloat128Frac1( a );
4299 aSig0 = extractFloat128Frac0( a );
4300 aExp = extractFloat128Exp( a );
4301 aSign = extractFloat128Sign( a );
4302 if ( aExp == 0x7FFF ) {
4303 if ( aSig0 | aSig1 ) {
4304 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4306 return packFloat64( aSign, 0x7FF, 0 );
4308 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4309 aSig0 |= ( aSig1 != 0 );
4310 if ( aExp || aSig0 ) {
4311 aSig0 |= LIT64( 0x4000000000000000 );
4314 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4320 /*----------------------------------------------------------------------------
4321 | Returns the result of converting the quadruple-precision floating-point
4322 | value `a' to the extended double-precision floating-point format. The
4323 | conversion is performed according to the IEC/IEEE Standard for Binary
4324 | Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4327 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4331 bits64 aSig0, aSig1;
4333 aSig1 = extractFloat128Frac1( a );
4334 aSig0 = extractFloat128Frac0( a );
4335 aExp = extractFloat128Exp( a );
4336 aSign = extractFloat128Sign( a );
4337 if ( aExp == 0x7FFF ) {
4338 if ( aSig0 | aSig1 ) {
4339 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4341 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4344 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4345 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4348 aSig0 |= LIT64( 0x0001000000000000 );
4350 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4351 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4357 /*----------------------------------------------------------------------------
4358 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4359 | returns the result as a quadruple-precision floating-point value. The
4360 | operation is performed according to the IEC/IEEE Standard for Binary
4361 | Floating-Point Arithmetic.
4362 *----------------------------------------------------------------------------*/
4364 float128 float128_round_to_int( float128 a STATUS_PARAM )
4368 bits64 lastBitMask, roundBitsMask;
4372 aExp = extractFloat128Exp( a );
4373 if ( 0x402F <= aExp ) {
4374 if ( 0x406F <= aExp ) {
4375 if ( ( aExp == 0x7FFF )
4376 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4378 return propagateFloat128NaN( a, a STATUS_VAR );
4383 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4384 roundBitsMask = lastBitMask - 1;
4386 roundingMode = STATUS(float_rounding_mode);
4387 if ( roundingMode == float_round_nearest_even ) {
4388 if ( lastBitMask ) {
4389 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4390 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4393 if ( (sbits64) z.low < 0 ) {
4395 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4399 else if ( roundingMode != float_round_to_zero ) {
4400 if ( extractFloat128Sign( z )
4401 ^ ( roundingMode == float_round_up ) ) {
4402 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4405 z.low &= ~ roundBitsMask;
4408 if ( aExp < 0x3FFF ) {
4409 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4410 STATUS(float_exception_flags) |= float_flag_inexact;
4411 aSign = extractFloat128Sign( a );
4412 switch ( STATUS(float_rounding_mode) ) {
4413 case float_round_nearest_even:
4414 if ( ( aExp == 0x3FFE )
4415 && ( extractFloat128Frac0( a )
4416 | extractFloat128Frac1( a ) )
4418 return packFloat128( aSign, 0x3FFF, 0, 0 );
4421 case float_round_down:
4423 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4424 : packFloat128( 0, 0, 0, 0 );
4425 case float_round_up:
4427 aSign ? packFloat128( 1, 0, 0, 0 )
4428 : packFloat128( 0, 0x3FFF, 0, 0 );
4430 return packFloat128( aSign, 0, 0, 0 );
4433 lastBitMask <<= 0x402F - aExp;
4434 roundBitsMask = lastBitMask - 1;
4437 roundingMode = STATUS(float_rounding_mode);
4438 if ( roundingMode == float_round_nearest_even ) {
4439 z.high += lastBitMask>>1;
4440 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4441 z.high &= ~ lastBitMask;
4444 else if ( roundingMode != float_round_to_zero ) {
4445 if ( extractFloat128Sign( z )
4446 ^ ( roundingMode == float_round_up ) ) {
4447 z.high |= ( a.low != 0 );
4448 z.high += roundBitsMask;
4451 z.high &= ~ roundBitsMask;
4453 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4454 STATUS(float_exception_flags) |= float_flag_inexact;
4460 /*----------------------------------------------------------------------------
4461 | Returns the result of adding the absolute values of the quadruple-precision
4462 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4463 | before being returned. `zSign' is ignored if the result is a NaN.
4464 | The addition is performed according to the IEC/IEEE Standard for Binary
4465 | Floating-Point Arithmetic.
4466 *----------------------------------------------------------------------------*/
4468 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4470 int32 aExp, bExp, zExp;
4471 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4474 aSig1 = extractFloat128Frac1( a );
4475 aSig0 = extractFloat128Frac0( a );
4476 aExp = extractFloat128Exp( a );
4477 bSig1 = extractFloat128Frac1( b );
4478 bSig0 = extractFloat128Frac0( b );
4479 bExp = extractFloat128Exp( b );
4480 expDiff = aExp - bExp;
4481 if ( 0 < expDiff ) {
4482 if ( aExp == 0x7FFF ) {
4483 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4490 bSig0 |= LIT64( 0x0001000000000000 );
4492 shift128ExtraRightJamming(
4493 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4496 else if ( expDiff < 0 ) {
4497 if ( bExp == 0x7FFF ) {
4498 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4499 return packFloat128( zSign, 0x7FFF, 0, 0 );
4505 aSig0 |= LIT64( 0x0001000000000000 );
4507 shift128ExtraRightJamming(
4508 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4512 if ( aExp == 0x7FFF ) {
4513 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4514 return propagateFloat128NaN( a, b STATUS_VAR );
4518 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4519 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4521 zSig0 |= LIT64( 0x0002000000000000 );
4525 aSig0 |= LIT64( 0x0001000000000000 );
4526 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4528 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4531 shift128ExtraRightJamming(
4532 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4534 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4538 /*----------------------------------------------------------------------------
4539 | Returns the result of subtracting the absolute values of the quadruple-
4540 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4541 | difference is negated before being returned. `zSign' is ignored if the
4542 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4543 | Standard for Binary Floating-Point Arithmetic.
4544 *----------------------------------------------------------------------------*/
4546 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4548 int32 aExp, bExp, zExp;
4549 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4553 aSig1 = extractFloat128Frac1( a );
4554 aSig0 = extractFloat128Frac0( a );
4555 aExp = extractFloat128Exp( a );
4556 bSig1 = extractFloat128Frac1( b );
4557 bSig0 = extractFloat128Frac0( b );
4558 bExp = extractFloat128Exp( b );
4559 expDiff = aExp - bExp;
4560 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4561 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4562 if ( 0 < expDiff ) goto aExpBigger;
4563 if ( expDiff < 0 ) goto bExpBigger;
4564 if ( aExp == 0x7FFF ) {
4565 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4566 return propagateFloat128NaN( a, b STATUS_VAR );
4568 float_raise( float_flag_invalid STATUS_VAR);
4569 z.low = float128_default_nan_low;
4570 z.high = float128_default_nan_high;
4577 if ( bSig0 < aSig0 ) goto aBigger;
4578 if ( aSig0 < bSig0 ) goto bBigger;
4579 if ( bSig1 < aSig1 ) goto aBigger;
4580 if ( aSig1 < bSig1 ) goto bBigger;
4581 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4583 if ( bExp == 0x7FFF ) {
4584 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4585 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4591 aSig0 |= LIT64( 0x4000000000000000 );
4593 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4594 bSig0 |= LIT64( 0x4000000000000000 );
4596 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4599 goto normalizeRoundAndPack;
4601 if ( aExp == 0x7FFF ) {
4602 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4609 bSig0 |= LIT64( 0x4000000000000000 );
4611 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4612 aSig0 |= LIT64( 0x4000000000000000 );
4614 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4616 normalizeRoundAndPack:
4618 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4622 /*----------------------------------------------------------------------------
4623 | Returns the result of adding the quadruple-precision floating-point values
4624 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4625 | for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4632 aSign = extractFloat128Sign( a );
4633 bSign = extractFloat128Sign( b );
4634 if ( aSign == bSign ) {
4635 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4638 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4643 /*----------------------------------------------------------------------------
4644 | Returns the result of subtracting the quadruple-precision floating-point
4645 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4646 | Standard for Binary Floating-Point Arithmetic.
4647 *----------------------------------------------------------------------------*/
4649 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4653 aSign = extractFloat128Sign( a );
4654 bSign = extractFloat128Sign( b );
4655 if ( aSign == bSign ) {
4656 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4659 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4664 /*----------------------------------------------------------------------------
4665 | Returns the result of multiplying the quadruple-precision floating-point
4666 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4667 | Standard for Binary Floating-Point Arithmetic.
4668 *----------------------------------------------------------------------------*/
4670 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4672 flag aSign, bSign, zSign;
4673 int32 aExp, bExp, zExp;
4674 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4677 aSig1 = extractFloat128Frac1( a );
4678 aSig0 = extractFloat128Frac0( a );
4679 aExp = extractFloat128Exp( a );
4680 aSign = extractFloat128Sign( a );
4681 bSig1 = extractFloat128Frac1( b );
4682 bSig0 = extractFloat128Frac0( b );
4683 bExp = extractFloat128Exp( b );
4684 bSign = extractFloat128Sign( b );
4685 zSign = aSign ^ bSign;
4686 if ( aExp == 0x7FFF ) {
4687 if ( ( aSig0 | aSig1 )
4688 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4689 return propagateFloat128NaN( a, b STATUS_VAR );
4691 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4692 return packFloat128( zSign, 0x7FFF, 0, 0 );
4694 if ( bExp == 0x7FFF ) {
4695 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4696 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4698 float_raise( float_flag_invalid STATUS_VAR);
4699 z.low = float128_default_nan_low;
4700 z.high = float128_default_nan_high;
4703 return packFloat128( zSign, 0x7FFF, 0, 0 );
4706 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4707 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4710 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4711 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4713 zExp = aExp + bExp - 0x4000;
4714 aSig0 |= LIT64( 0x0001000000000000 );
4715 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4716 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4717 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4718 zSig2 |= ( zSig3 != 0 );
4719 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4720 shift128ExtraRightJamming(
4721 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4724 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of dividing the quadruple-precision floating-point value
4730 | `a' by the corresponding value `b'. The operation is performed according to
4731 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4732 *----------------------------------------------------------------------------*/
4734 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4736 flag aSign, bSign, zSign;
4737 int32 aExp, bExp, zExp;
4738 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4739 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4742 aSig1 = extractFloat128Frac1( a );
4743 aSig0 = extractFloat128Frac0( a );
4744 aExp = extractFloat128Exp( a );
4745 aSign = extractFloat128Sign( a );
4746 bSig1 = extractFloat128Frac1( b );
4747 bSig0 = extractFloat128Frac0( b );
4748 bExp = extractFloat128Exp( b );
4749 bSign = extractFloat128Sign( b );
4750 zSign = aSign ^ bSign;
4751 if ( aExp == 0x7FFF ) {
4752 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4753 if ( bExp == 0x7FFF ) {
4754 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4757 return packFloat128( zSign, 0x7FFF, 0, 0 );
4759 if ( bExp == 0x7FFF ) {
4760 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4761 return packFloat128( zSign, 0, 0, 0 );
4764 if ( ( bSig0 | bSig1 ) == 0 ) {
4765 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4767 float_raise( float_flag_invalid STATUS_VAR);
4768 z.low = float128_default_nan_low;
4769 z.high = float128_default_nan_high;
4772 float_raise( float_flag_divbyzero STATUS_VAR);
4773 return packFloat128( zSign, 0x7FFF, 0, 0 );
4775 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4778 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4779 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4781 zExp = aExp - bExp + 0x3FFD;
4783 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4785 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4786 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4787 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4790 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4791 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4792 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4793 while ( (sbits64) rem0 < 0 ) {
4795 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4797 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4798 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4799 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4800 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4801 while ( (sbits64) rem1 < 0 ) {
4803 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4805 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4807 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4808 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4812 /*----------------------------------------------------------------------------
4813 | Returns the remainder of the quadruple-precision floating-point value `a'
4814 | with respect to the corresponding value `b'. The operation is performed
4815 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4818 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4820 flag aSign, bSign, zSign;
4821 int32 aExp, bExp, expDiff;
4822 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4823 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4827 aSig1 = extractFloat128Frac1( a );
4828 aSig0 = extractFloat128Frac0( a );
4829 aExp = extractFloat128Exp( a );
4830 aSign = extractFloat128Sign( a );
4831 bSig1 = extractFloat128Frac1( b );
4832 bSig0 = extractFloat128Frac0( b );
4833 bExp = extractFloat128Exp( b );
4834 bSign = extractFloat128Sign( b );
4835 if ( aExp == 0x7FFF ) {
4836 if ( ( aSig0 | aSig1 )
4837 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4838 return propagateFloat128NaN( a, b STATUS_VAR );
4842 if ( bExp == 0x7FFF ) {
4843 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4847 if ( ( bSig0 | bSig1 ) == 0 ) {
4849 float_raise( float_flag_invalid STATUS_VAR);
4850 z.low = float128_default_nan_low;
4851 z.high = float128_default_nan_high;
4854 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4858 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4860 expDiff = aExp - bExp;
4861 if ( expDiff < -1 ) return a;
4863 aSig0 | LIT64( 0x0001000000000000 ),
4865 15 - ( expDiff < 0 ),
4870 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4871 q = le128( bSig0, bSig1, aSig0, aSig1 );
4872 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4874 while ( 0 < expDiff ) {
4875 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4876 q = ( 4 < q ) ? q - 4 : 0;
4877 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4878 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4879 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4880 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4883 if ( -64 < expDiff ) {
4884 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4885 q = ( 4 < q ) ? q - 4 : 0;
4887 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4889 if ( expDiff < 0 ) {
4890 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4893 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4895 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4896 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4899 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4900 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4903 alternateASig0 = aSig0;
4904 alternateASig1 = aSig1;
4906 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4907 } while ( 0 <= (sbits64) aSig0 );
4909 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4910 if ( ( sigMean0 < 0 )
4911 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4912 aSig0 = alternateASig0;
4913 aSig1 = alternateASig1;
4915 zSign = ( (sbits64) aSig0 < 0 );
4916 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4918 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4922 /*----------------------------------------------------------------------------
4923 | Returns the square root of the quadruple-precision floating-point value `a'.
4924 | The operation is performed according to the IEC/IEEE Standard for Binary
4925 | Floating-Point Arithmetic.
4926 *----------------------------------------------------------------------------*/
4928 float128 float128_sqrt( float128 a STATUS_PARAM )
4932 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4933 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4936 aSig1 = extractFloat128Frac1( a );
4937 aSig0 = extractFloat128Frac0( a );
4938 aExp = extractFloat128Exp( a );
4939 aSign = extractFloat128Sign( a );
4940 if ( aExp == 0x7FFF ) {
4941 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4942 if ( ! aSign ) return a;
4946 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4948 float_raise( float_flag_invalid STATUS_VAR);
4949 z.low = float128_default_nan_low;
4950 z.high = float128_default_nan_high;
4954 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4955 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4957 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4958 aSig0 |= LIT64( 0x0001000000000000 );
4959 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4960 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4961 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4962 doubleZSig0 = zSig0<<1;
4963 mul64To128( zSig0, zSig0, &term0, &term1 );
4964 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4965 while ( (sbits64) rem0 < 0 ) {
4968 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4970 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4971 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4972 if ( zSig1 == 0 ) zSig1 = 1;
4973 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4974 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4975 mul64To128( zSig1, zSig1, &term2, &term3 );
4976 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4977 while ( (sbits64) rem1 < 0 ) {
4979 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4981 term2 |= doubleZSig0;
4982 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4984 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4986 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4987 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4991 /*----------------------------------------------------------------------------
4992 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4993 | the corresponding value `b', and 0 otherwise. The comparison is performed
4994 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4997 flag float128_eq( float128 a, float128 b STATUS_PARAM )
5000 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5001 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5002 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5003 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5005 if ( float128_is_signaling_nan( a )
5006 || float128_is_signaling_nan( b ) ) {
5007 float_raise( float_flag_invalid STATUS_VAR);
5013 && ( ( a.high == b.high )
5015 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5020 /*----------------------------------------------------------------------------
5021 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5022 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5025 *----------------------------------------------------------------------------*/
5027 flag float128_le( float128 a, float128 b STATUS_PARAM )
5031 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5032 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5033 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5034 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5036 float_raise( float_flag_invalid STATUS_VAR);
5039 aSign = extractFloat128Sign( a );
5040 bSign = extractFloat128Sign( b );
5041 if ( aSign != bSign ) {
5044 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5048 aSign ? le128( b.high, b.low, a.high, a.low )
5049 : le128( a.high, a.low, b.high, b.low );
5053 /*----------------------------------------------------------------------------
5054 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5055 | the corresponding value `b', and 0 otherwise. The comparison is performed
5056 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5057 *----------------------------------------------------------------------------*/
5059 flag float128_lt( float128 a, float128 b STATUS_PARAM )
5063 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5064 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5065 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5066 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5068 float_raise( float_flag_invalid STATUS_VAR);
5071 aSign = extractFloat128Sign( a );
5072 bSign = extractFloat128Sign( b );
5073 if ( aSign != bSign ) {
5076 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5080 aSign ? lt128( b.high, b.low, a.high, a.low )
5081 : lt128( a.high, a.low, b.high, b.low );
5085 /*----------------------------------------------------------------------------
5086 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5087 | the corresponding value `b', and 0 otherwise. The invalid exception is
5088 | raised if either operand is a NaN. Otherwise, the comparison is performed
5089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5092 flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5095 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5096 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5097 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5098 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5100 float_raise( float_flag_invalid STATUS_VAR);
5105 && ( ( a.high == b.high )
5107 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5112 /*----------------------------------------------------------------------------
5113 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5114 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5115 | cause an exception. Otherwise, the comparison is performed according to the
5116 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 *----------------------------------------------------------------------------*/
5119 flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5123 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5124 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5125 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5126 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5128 if ( float128_is_signaling_nan( a )
5129 || float128_is_signaling_nan( b ) ) {
5130 float_raise( float_flag_invalid STATUS_VAR);
5134 aSign = extractFloat128Sign( a );
5135 bSign = extractFloat128Sign( b );
5136 if ( aSign != bSign ) {
5139 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5143 aSign ? le128( b.high, b.low, a.high, a.low )
5144 : le128( a.high, a.low, b.high, b.low );
5148 /*----------------------------------------------------------------------------
5149 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5150 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5151 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5152 | Standard for Binary Floating-Point Arithmetic.
5153 *----------------------------------------------------------------------------*/
5155 flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5159 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5160 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5161 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5162 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5164 if ( float128_is_signaling_nan( a )
5165 || float128_is_signaling_nan( b ) ) {
5166 float_raise( float_flag_invalid STATUS_VAR);
5170 aSign = extractFloat128Sign( a );
5171 bSign = extractFloat128Sign( b );
5172 if ( aSign != bSign ) {
5175 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5179 aSign ? lt128( b.high, b.low, a.high, a.low )
5180 : lt128( a.high, a.low, b.high, b.low );