-
Notifications
You must be signed in to change notification settings - Fork 13k
/
ieee.rs
2753 lines (2361 loc) · 98.4 KB
/
ieee.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use crate::{Category, ExpInt, IEK_INF, IEK_NAN, IEK_ZERO};
use crate::{Float, FloatConvert, ParseError, Round, Status, StatusAnd};
use core::cmp::{self, Ordering};
use core::convert::TryFrom;
use core::fmt::{self, Write};
use core::marker::PhantomData;
use core::mem;
use core::ops::Neg;
use smallvec::{smallvec, SmallVec};
/// A floating-point value held in decomposed form: sign, category, unbiased
/// exponent and significand. The marker type `S` selects the format; the
/// `Semantics` trait supplies its parameters.
#[must_use]
pub struct IeeeFloat<S> {
/// Absolute significand value (including the integer bit).
sig: [Limb; 1],
/// The signed unbiased exponent of the value.
exp: ExpInt,
/// What kind of floating point number this is.
category: Category,
/// Sign bit of the number (`true` is printed as `-` by the `Display` impl).
sign: bool,
/// Zero-sized marker that ties the value to `S` without storing an `S`.
marker: PhantomData<S>,
}
/// Fundamental unit of big integer arithmetic, which is also large enough
/// to hold the largest supported significand in a single limb.
type Limb = u128;

/// Width of one [`Limb`] in bits.
const LIMB_BITS: usize = 128;

/// Returns the number of [`Limb`]s needed to hold `bits` bits, i.e.
/// `bits / LIMB_BITS` rounded up.
///
/// The result is built from the quotient and remainder rather than from
/// `(bits + LIMB_BITS - 1) / LIMB_BITS`, so the intermediate value cannot
/// overflow when `bits` is close to `usize::MAX`.
fn limbs_for_bits(bits: usize) -> usize {
    bits / LIMB_BITS + usize::from(bits % LIMB_BITS != 0)
}
/// Describes what fraction of the least significant bit the truncated bits
/// of an fp number represent.
///
/// This essentially combines the roles of guard and sticky bits.
#[must_use]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum Loss {
// Each variant is followed by an example pattern of the truncated bits.
ExactlyZero, // 000000
LessThanHalf, // 0xxxxx (x's not all zero)
ExactlyHalf, // 100000
MoreThanHalf, // 1xxxxx (x's not all zero)
}
/// Describes the parameters of a binary floating-point format and how values
/// of that format are converted to and from their in-memory bit pattern.
pub trait Semantics: Sized {
    /// Width, in bits, of the in-memory encoding.
    const BITS: usize;

    /// Width, in bits, of the significand, counting the integer bit.
    const PRECISION: usize;

    /// The largest E such that 2<sup>E</sup> is representable; this matches
    /// the definition of IEEE 754.
    const MAX_EXP: ExpInt;

    /// The smallest E such that 2<sup>E</sup> is a normalized number; this
    /// matches the definition of IEEE 754.
    const MIN_EXP: ExpInt = -Self::MAX_EXP + 1;

    /// Index of the significand bit that marks a NaN as quiet.
    const QNAN_BIT: usize = Self::PRECISION - 2;

    /// Significand bit pattern that marks a NaN as quiet.
    /// NOTE: X87DoubleExtended overrides this, as it needs two bits set
    /// instead of one.
    const QNAN_SIGNIFICAND: Limb = 1 << Self::QNAN_BIT;

    /// Decodes an in-memory bit pattern whose integer bit is implicit.
    ///
    /// The layout read here is: sign in bit `BITS - 1`, the biased exponent
    /// above the low `PRECISION - 1` bits, and the fraction in those low bits.
    ///
    /// # Panics
    ///
    /// Panics unless `BITS > PRECISION`.
    fn from_bits(bits: u128) -> IeeeFloat<Self> {
        assert!(Self::BITS > Self::PRECISION);

        let fraction_bits = Self::PRECISION - 1;
        let sign_mask: u128 = 1 << (Self::BITS - 1);
        let fraction_mask: u128 = (1 << fraction_bits) - 1;

        let mut value = IeeeFloat {
            sig: [bits & fraction_mask],
            // Remove the bias to obtain the signed exponent.
            exp: (((bits & !sign_mask) >> fraction_bits) as ExpInt) - Self::MAX_EXP,
            category: Category::Normal,
            sign: (bits & sign_mask) != 0,
            marker: PhantomData,
        };

        // Exponent values produced by an all-zeros / all-ones exponent field.
        let all_zeros_exp = Self::MIN_EXP - 1;
        let all_ones_exp = Self::MAX_EXP + 1;
        let fraction_is_zero = value.sig == [0];

        value.category = if value.exp == all_zeros_exp && fraction_is_zero {
            // Exponent and significand carry no meaning.
            Category::Zero
        } else if value.exp == all_ones_exp {
            if fraction_is_zero {
                // Exponent and significand carry no meaning.
                Category::Infinity
            } else {
                // Sign, exponent and significand carry no meaning.
                Category::NaN
            }
        } else {
            Category::Normal
        };

        if let Category::Normal = value.category {
            if value.exp == all_zeros_exp {
                // Denormal: no integer bit, exponent pinned to the minimum.
                value.exp = Self::MIN_EXP;
            } else {
                // Make the implicit integer bit explicit.
                sig::set_bit(&mut value.sig, fraction_bits);
            }
        }

        value
    }

    /// Encodes `x` into the in-memory bit pattern read by [`Self::from_bits`].
    ///
    /// For zeros and infinities the stored significand is ignored and written
    /// as zero; for NaNs the low `PRECISION - 1` significand bits are kept.
    ///
    /// # Panics
    ///
    /// Panics unless `BITS > PRECISION`.
    fn to_bits(x: IeeeFloat<Self>) -> u128 {
        assert!(Self::BITS > Self::PRECISION);

        let fraction_bits = Self::PRECISION - 1;
        // The integer bit is not stored, but it tells denormals from normals.
        let has_integer_bit = sig::get_bit(&x.sig, fraction_bits);
        let stored_fraction = x.sig[0] & ((1 << fraction_bits) - 1);

        let (exponent, fraction) = match x.category {
            // Denormal.
            Category::Normal if x.exp == Self::MIN_EXP && !has_integer_bit => {
                (Self::MIN_EXP - 1, stored_fraction)
            }
            Category::Normal => (x.exp, stored_fraction),
            // FIXME(eddyb) Maybe we should guarantee an invariant instead?
            Category::Zero => (Self::MIN_EXP - 1, 0),
            // FIXME(eddyb) Maybe we should guarantee an invariant instead?
            Category::Infinity => (Self::MAX_EXP + 1, 0),
            Category::NaN => (Self::MAX_EXP + 1, stored_fraction),
        };

        // Add the bias back to obtain the stored exponent field.
        let biased_exponent = (exponent + Self::MAX_EXP) as u128;
        let sign_field = (x.sign as u128) << (Self::BITS - 1);

        sign_field | (biased_exponent << fraction_bits) | fraction
    }
}
// `Copy` and `Clone` are written by hand, with no bound on `S`, so that
// `IeeeFloat<S>` can be copied for any marker type.
impl<S> Copy for IeeeFloat<S> {}
impl<S> Clone for IeeeFloat<S> {
/// Returns a bitwise copy of `self`.
fn clone(&self) -> Self {
*self
}
}
/// For each `Name = Sem(bits : exp_bits)` entry, declares a marker struct
/// `Sem`, a type alias `Name = IeeeFloat<Sem>`, and a `Semantics` impl whose
/// constants are derived from the total width and the exponent width.
macro_rules! ieee_semantics {
($($name:ident = $sem:ident($bits:tt : $exp_bits:tt)),*) => {
$(pub struct $sem;)*
$(pub type $name = IeeeFloat<$sem>;)*
$(impl Semantics for $sem {
const BITS: usize = $bits;
// Stored fraction bits (total minus the sign bit and the exponent
// field), plus one for the integer bit.
const PRECISION: usize = ($bits - 1 - $exp_bits) + 1;
// 2^(exp_bits - 1) - 1.
const MAX_EXP: ExpInt = (1 << ($exp_bits - 1)) - 1;
})*
}
}
// The formats declared here, each given as `total bits : exponent bits`.
ieee_semantics! {
Half = HalfS(16:5),
Single = SingleS(32:8),
Double = DoubleS(64:11),
Quad = QuadS(128:15)
}
/// Marker type selecting the 80-bit x87 double-extended format.
pub struct X87DoubleExtendedS;

/// A floating-point value in the 80-bit x87 double-extended format.
pub type X87DoubleExtended = IeeeFloat<X87DoubleExtendedS>;

impl Semantics for X87DoubleExtendedS {
    const BITS: usize = 80;
    const PRECISION: usize = 64;
    const MAX_EXP: ExpInt = (1 << (15 - 1)) - 1;

    /// For x87 extended precision, we want to make a NaN, not a
    /// pseudo-NaN. Maybe we should expose the ability to make
    /// pseudo-NaNs?
    const QNAN_SIGNIFICAND: Limb = 0b11 << Self::QNAN_BIT;

    /// Decodes an 80-bit pattern: sign in bit 79, biased exponent in the
    /// bits above the low `PRECISION` bits, and the full significand
    /// (including its explicit integer bit) in those low `PRECISION` bits.
    ///
    /// Integer bit is explicit in this format. Intel hardware (387 and later)
    /// does not support these bit patterns:
    ///  exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
    ///  exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN")
    ///  exponent = 0, integer bit 1 ("pseudodenormal")
    ///  exponent != 0 nor all 1's, integer bit 0 ("unnormal")
    /// At the moment, the first two are treated as NaNs, the second two as Normal.
    fn from_bits(bits: u128) -> IeeeFloat<Self> {
        let sign = bits & (1 << (Self::BITS - 1));
        let exponent = (bits & !sign) >> Self::PRECISION;
        let mut r = IeeeFloat {
            // Keep all `PRECISION` significand bits. The integer bit is part
            // of the stored pattern, and the category checks below (as well
            // as `to_bits`) rely on it being present.
            sig: [bits & ((1 << Self::PRECISION) - 1)],
            // Convert the exponent from its bias representation to a signed integer.
            exp: (exponent as ExpInt) - Self::MAX_EXP,
            category: Category::Zero,
            sign: sign != 0,
            marker: PhantomData,
        };

        if r.exp == Self::MIN_EXP - 1 && r.sig == [0] {
            // Exponent, significand meaningless.
            r.category = Category::Zero;
        } else if r.exp == Self::MAX_EXP + 1 && r.sig == [1 << (Self::PRECISION - 1)] {
            // Only the integer bit is set: infinity.
            // Exponent, significand meaningless.
            r.category = Category::Infinity;
        } else if r.exp == Self::MAX_EXP + 1 && r.sig != [1 << (Self::PRECISION - 1)] {
            // Sign, exponent, significand meaningless.
            r.category = Category::NaN;
        } else {
            r.category = Category::Normal;
            if r.exp == Self::MIN_EXP - 1 {
                // Denormal.
                r.exp = Self::MIN_EXP;
            }
        }

        r
    }

    /// Encodes `x` into the 80-bit pattern read by `from_bits`; the integer
    /// bit is written out as part of the significand.
    fn to_bits(x: IeeeFloat<Self>) -> u128 {
        // Get integer bit from significand.
        let integer_bit = sig::get_bit(&x.sig, Self::PRECISION - 1);
        let mut significand = x.sig[0] & ((1 << Self::PRECISION) - 1);
        let exponent = match x.category {
            Category::Normal => {
                if x.exp == Self::MIN_EXP && !integer_bit {
                    // Denormal.
                    Self::MIN_EXP - 1
                } else {
                    x.exp
                }
            }
            Category::Zero => {
                // FIXME(eddyb) Maybe we should guarantee an invariant instead?
                significand = 0;
                Self::MIN_EXP - 1
            }
            Category::Infinity => {
                // FIXME(eddyb) Maybe we should guarantee an invariant instead?
                // Infinity is encoded with only the integer bit set.
                significand = 1 << (Self::PRECISION - 1);
                Self::MAX_EXP + 1
            }
            Category::NaN => Self::MAX_EXP + 1,
        };

        // Convert the exponent from a signed integer to its bias representation.
        let exponent = (exponent + Self::MAX_EXP) as u128;

        ((x.sign as u128) << (Self::BITS - 1)) | (exponent << Self::PRECISION) | significand
    }
}
// Expands the `float_common_impls!` macro for `IeeeFloat<S>`. The macro is
// defined outside this part of the file; see its definition for what it emits.
float_common_impls!(IeeeFloat<S>);
impl<S: Semantics> PartialEq for IeeeFloat<S> {
    /// Two values are equal exactly when `partial_cmp` orders them as
    /// `Ordering::Equal`; unordered pairs (`None`) are never equal.
    fn eq(&self, rhs: &Self) -> bool {
        match self.partial_cmp(rhs) {
            Some(Ordering::Equal) => true,
            Some(_) | None => false,
        }
    }
}
impl<S: Semantics> PartialOrd for IeeeFloat<S> {
    /// Orders two values numerically.
    ///
    /// Returns `None` when either operand is a NaN. Zeros compare equal
    /// regardless of sign.
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        // Where a value whose sign alone decides the outcome sits relative
        // to the other operand.
        let side_of = |negative: bool| {
            if negative {
                Ordering::Less
            } else {
                Ordering::Greater
            }
        };

        let ordering = match (self.category, rhs.category) {
            (Category::NaN, _) | (_, Category::NaN) => return None,

            (Category::Zero, Category::Zero) => Ordering::Equal,

            // Only the signs distinguish two infinities.
            (Category::Infinity, Category::Infinity) => rhs.sign.cmp(&self.sign),

            // The left operand dominates: its sign decides.
            (Category::Infinity, _) | (Category::Normal, Category::Zero) => side_of(self.sign),

            // The right operand dominates: its sign decides, mirrored.
            (_, Category::Infinity) | (Category::Zero, Category::Normal) => {
                side_of(rhs.sign).reverse()
            }

            (Category::Normal, Category::Normal) => match rhs.sign.cmp(&self.sign) {
                Ordering::Equal => {
                    // Same sign: compare magnitudes, flipping for negatives.
                    let magnitude = self.cmp_abs_normal(*rhs);
                    if self.sign {
                        magnitude.reverse()
                    } else {
                        magnitude
                    }
                }
                by_sign => by_sign,
            },
        };

        Some(ordering)
    }
}
impl<S> Neg for IeeeFloat<S> {
type Output = Self;
fn neg(mut self) -> Self {
self.sign = !self.sign;
self
}
}
/// Prints this value as a decimal string.
///
/// The formatter's settings are interpreted as follows:
///
/// * precision: the maximum number of digits of precision to output. If
///   there are fewer digits available, zero padding will not be used unless
///   the value is integral and small enough to be expressed in precision
///   digits. When no precision is given, a natural precision derived from
///   `S::PRECISION` is used.
/// * width: the maximum number of zeros to consider inserting before
///   falling back to scientific notation. 0 means to always use scientific
///   notation. When no width is given, 3 is used.
/// * alternate (`#`): indicates whether to keep the trailing zeros in the
///   fraction part. Setting it also produces output more similar to default
///   printf behavior: a lower-case `e` is used as the exponent delimiter and
///   the exponent always contains no less than two digits.
///
/// ```text
/// Number     precision  width  Result
/// ------     ---------  -----  ------
/// 1.01E+4    5          2      10100
/// 1.01E+4    4          2      1.01E+4
/// 1.01E+4    5          1      1.01E+4
/// 1.01E-2    5          2      0.0101
/// 1.01E-2    4          2      0.0101
/// 1.01E-2    4          1      1.01E-2
/// ```
impl<S: Semantics> fmt::Display for IeeeFloat<S> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let width = f.width().unwrap_or(3);
let alternate = f.alternate();
// Infinities, NaNs and zeros have fixed spellings and return early.
match self.category {
Category::Infinity => {
if self.sign {
return f.write_str("-Inf");
} else {
return f.write_str("+Inf");
}
}
Category::NaN => return f.write_str("NaN"),
Category::Zero => {
if self.sign {
f.write_char('-')?;
}
if width == 0 {
// Scientific notation was requested.
if alternate {
f.write_str("0.0")?;
// Pad the fraction with zeros according to the requested precision.
if let Some(n) = f.precision() {
for _ in 1..n {
f.write_char('0')?;
}
}
f.write_str("e+00")?;
} else {
f.write_str("0.0E+0")?;
}
} else {
f.write_char('0')?;
}
return Ok(());
}
Category::Normal => {}
}
if self.sign {
f.write_char('-')?;
}
// We use enough digits so the number can be round-tripped back to an
// APFloat. The formula comes from "How to Print Floating-Point Numbers
// Accurately" by Steele and White.
// FIXME: Using a formula based purely on the precision is conservative;
// we can print fewer digits depending on the actual value being printed.
// precision = 2 + floor(S::PRECISION / lg_2(10))
let precision = f.precision().unwrap_or(2 + S::PRECISION * 59 / 196);
// Decompose the number into an APInt and an exponent.
let mut exp = self.exp - (S::PRECISION as ExpInt - 1);
let mut sig = vec![self.sig[0]];
// Ignore trailing binary zeros.
let trailing_zeros = sig[0].trailing_zeros();
let _: Loss = sig::shift_right(&mut sig, &mut exp, trailing_zeros as usize);
// Change the exponent from 2^e to 10^e.
if exp == 0 {
// Nothing to do.
} else if exp > 0 {
// Just shift left.
let shift = exp as usize;
sig.resize(limbs_for_bits(S::PRECISION + shift), 0);
sig::shift_left(&mut sig, &mut exp, shift);
} else {
// exp < 0
let mut texp = -exp as usize;
// We transform this using the identity:
// (N)(2^-e) == (N)(5^e)(10^-e)
// Multiply significand by 5^e.
// N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
let mut sig_scratch = vec![];
let mut p5 = vec![];
let mut p5_scratch = vec![];
while texp != 0 {
// `p5` starts at 5 and is squared on every later iteration.
if p5.is_empty() {
p5.push(5);
} else {
p5_scratch.resize(p5.len() * 2, 0);
let _: Loss =
sig::mul(&mut p5_scratch, &mut 0, &p5, &p5, p5.len() * 2 * LIMB_BITS);
// Trim zero limbs from the top of the product.
while p5_scratch.last() == Some(&0) {
p5_scratch.pop();
}
mem::swap(&mut p5, &mut p5_scratch);
}
// Fold the current power of 5 into the significand when this bit of `texp` is set.
if texp & 1 != 0 {
sig_scratch.resize(sig.len() + p5.len(), 0);
let _: Loss = sig::mul(
&mut sig_scratch,
&mut 0,
&sig,
&p5,
(sig.len() + p5.len()) * LIMB_BITS,
);
while sig_scratch.last() == Some(&0) {
sig_scratch.pop();
}
mem::swap(&mut sig, &mut sig_scratch);
}
texp >>= 1;
}
}
// Fill the buffer. Digits are pushed least significant first.
let mut buffer = vec![];
// Ignore digits from the significand until it is no more
// precise than is required for the desired precision.
// 196/59 is a very slight overestimate of lg_2(10).
let required = (precision * 196 + 58) / 59;
let mut discard_digits = sig::omsb(&sig).saturating_sub(required) * 59 / 196;
let mut in_trail = true;
while !sig.is_empty() {
// Perform short division by 10 to extract the rightmost digit.
// rem <- sig % 10
// sig <- sig / 10
let mut rem = 0;
// Use 64-bit division and remainder, with 32-bit chunks from sig.
sig::each_chunk(&mut sig, 32, |chunk| {
let chunk = chunk as u32;
let combined = ((rem as u64) << 32) | (chunk as u64);
rem = (combined % 10) as u8;
(combined / 10) as u32 as Limb
});
// Reduce the significand to avoid wasting time dividing 0's.
while sig.last() == Some(&0) {
sig.pop();
}
let digit = rem;
// Ignore digits we don't need.
if discard_digits > 0 {
discard_digits -= 1;
exp += 1;
continue;
}
// Drop trailing zeros.
if in_trail && digit == 0 {
exp += 1;
} else {
in_trail = false;
buffer.push(b'0' + digit);
}
}
assert!(!buffer.is_empty(), "no characters in buffer!");
// Drop down to precision.
// FIXME: don't do more precise calculations above than are required.
if buffer.len() > precision {
// The most significant figures are the last ones in the buffer.
let mut first_sig = buffer.len() - precision;
// Round.
// FIXME: this probably shouldn't use 'round half up'.
// Rounding down is just a truncation, except we also want to drop
// trailing zeros from the new result.
if buffer[first_sig - 1] < b'5' {
while first_sig < buffer.len() && buffer[first_sig] == b'0' {
first_sig += 1;
}
} else {
// Rounding up requires a decimal add-with-carry. If we continue
// the carry, the newly-introduced zeros will just be truncated.
for x in &mut buffer[first_sig..] {
if *x == b'9' {
first_sig += 1;
} else {
*x += 1;
break;
}
}
}
exp += first_sig as ExpInt;
buffer.drain(..first_sig);
// If we carried through, we have exactly one digit of precision.
if buffer.is_empty() {
buffer.push(b'1');
}
}
let digits = buffer.len();
// Check whether we should use scientific notation.
let scientific = if width == 0 {
true
} else if exp >= 0 {
// 765e3 --> 765000
//              ^^^
// But we shouldn't make the number look more precise than it is.
exp as usize > width || digits + exp as usize > precision
} else {
// Power of the most significant digit.
let msd = exp + (digits - 1) as ExpInt;
if msd >= 0 {
// 765e-2 == 7.65
false
} else {
// 765e-5 == 0.00765
//           ^ ^^
-msd as usize > width
}
};
// Scientific formatting is pretty straightforward.
if scientific {
exp += digits as ExpInt - 1;
// The most significant digit is the last one in the buffer.
f.write_char(buffer[digits - 1] as char)?;
f.write_char('.')?;
let truncate_zero = !alternate;
if digits == 1 && truncate_zero {
f.write_char('0')?;
} else {
for &d in buffer[..digits - 1].iter().rev() {
f.write_char(d as char)?;
}
}
// Fill with zeros up to precision.
if !truncate_zero && precision > digits - 1 {
for _ in 0..=precision - digits {
f.write_char('0')?;
}
}
// For alternate we use lower 'e'.
f.write_char(if alternate { 'e' } else { 'E' })?;
// Exponent always at least two digits if we do not truncate zeros.
if truncate_zero {
write!(f, "{:+}", exp)?;
} else {
write!(f, "{:+03}", exp)?;
}
return Ok(());
}
// Non-scientific, positive exponents.
if exp >= 0 {
for &d in buffer.iter().rev() {
f.write_char(d as char)?;
}
for _ in 0..exp {
f.write_char('0')?;
}
return Ok(());
}
// Non-scientific, negative exponents.
let unit_place = -exp as usize;
if unit_place < digits {
// The decimal point falls inside the digit string.
for &d in buffer[unit_place..].iter().rev() {
f.write_char(d as char)?;
}
f.write_char('.')?;
for &d in buffer[..unit_place].iter().rev() {
f.write_char(d as char)?;
}
} else {
// The value is below 1: emit "0." and any leading fractional zeros first.
f.write_str("0.")?;
for _ in digits..unit_place {
f.write_char('0')?;
}
for &d in buffer.iter().rev() {
f.write_char(d as char)?;
}
}
Ok(())
}
}
impl<S: Semantics> fmt::Debug for IeeeFloat<S> {
    /// Writes the `Display` form followed, in parentheses, by the raw parts:
    /// category, sign, significand limbs and binary exponent.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let sign_symbol = if self.sign { "-" } else { "+" };
        write!(
            f,
            "{}({:?} | {}{:?} * 2^{})",
            self, self.category, sign_symbol, self.sig, self.exp
        )
    }
}
impl<S: Semantics> Float for IeeeFloat<S> {
// Format parameters, forwarded unchanged from the `Semantics` impl `S`.
const BITS: usize = S::BITS;
const PRECISION: usize = S::PRECISION;
const MAX_EXP: ExpInt = S::MAX_EXP;
const MIN_EXP: ExpInt = S::MIN_EXP;
// Positive zero: empty significand, exponent one below `MIN_EXP`.
const ZERO: Self = IeeeFloat {
sig: [0],
exp: S::MIN_EXP - 1,
category: Category::Zero,
sign: false,
marker: PhantomData,
};
// Positive infinity: empty significand, exponent one above `MAX_EXP`.
const INFINITY: Self = IeeeFloat {
sig: [0],
exp: S::MAX_EXP + 1,
category: Category::Infinity,
sign: false,
marker: PhantomData,
};
// Positive quiet NaN with no payload bits set.
// FIXME(eddyb) remove when qnan becomes const fn.
const NAN: Self = IeeeFloat {
sig: [S::QNAN_SIGNIFICAND],
exp: S::MAX_EXP + 1,
category: Category::NaN,
sign: false,
marker: PhantomData,
};
/// Builds a positive quiet NaN, optionally carrying `payload`.
///
/// Only the payload bits below `S::QNAN_BIT` are kept; they are OR-ed with
/// `S::QNAN_SIGNIFICAND`.
fn qnan(payload: Option<u128>) -> Self {
    let payload_bits = match payload {
        // Zero out the excess bits of the significand.
        Some(bits) => bits & ((1 << S::QNAN_BIT) - 1),
        None => 0,
    };

    IeeeFloat {
        sig: [S::QNAN_SIGNIFICAND | payload_bits],
        exp: S::MAX_EXP + 1,
        category: Category::NaN,
        sign: false,
        marker: PhantomData,
    }
}
/// Builds a positive signaling NaN, optionally carrying `payload`.
///
/// Starts from `qnan(payload)` and clears the quiet bit. If that leaves no
/// payload bit set, the bit just below the quiet bit is set instead.
fn snan(payload: Option<u128>) -> Self {
    let mut value = Self::qnan(payload);

    // The QNaN bit must always be cleared to obtain an SNaN.
    sig::clear_bit(&mut value.sig, S::QNAN_BIT);

    // With an empty payload, *something* has to be set for this to be a NaN
    // rather than an infinity; conventionally that is the next bit down
    // from the QNaN bit.
    let payload_is_empty = (value.sig[0] & !S::QNAN_SIGNIFICAND) == 0;
    if payload_is_empty {
        sig::set_bit(&mut value.sig, S::QNAN_BIT - 1);
    }

    value
}
/// Returns the largest finite positive value of the format.
fn largest() -> Self {
    // In interchange format this is:
    //   exponent = 1..10
    //   significand = 1..1
    let all_ones = (1 << S::PRECISION) - 1;

    IeeeFloat {
        sig: [all_ones],
        exp: S::MAX_EXP,
        category: Category::Normal,
        sign: false,
        marker: PhantomData,
    }
}
// Smallest positive value: only the lowest significand bit is set, at the
// minimum exponent.
// We want (in interchange format):
// exponent = 0..0
// significand = 0..01
const SMALLEST: Self = IeeeFloat {
sig: [1],
exp: S::MIN_EXP,
category: Category::Normal,
sign: false,
marker: PhantomData,
};
/// Returns the smallest positive value whose integer bit is set.
fn smallest_normalized() -> Self {
    // In interchange format this is:
    //   exponent = 0..0
    //   significand = 10..0
    let integer_bit_only = 1 << (S::PRECISION - 1);

    IeeeFloat {
        sig: [integer_bit_only],
        exp: S::MIN_EXP,
        category: Category::Normal,
        sign: false,
        marker: PhantomData,
    }
}
/// Adds `rhs` to `self` using rounding mode `round`.
///
/// Returns the sum together with the status flags raised. Adding
/// infinities of opposite sign yields `Self::NAN` with `Status::INVALID_OP`.
fn add_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
    let status = match (self.category, rhs.category) {
        (Category::Infinity, Category::Infinity) => {
            if self.sign == rhs.sign {
                Status::OK
            } else {
                // Differently signed infinities can only be validly
                // subtracted.
                self = Self::NAN;
                Status::INVALID_OP
            }
        }

        // `self` is already the answer. For zero results the sign may
        // depend on the rounding mode; that is handled below.
        (_, Category::Zero) | (Category::NaN, _) | (Category::Infinity, Category::Normal) => {
            Status::OK
        }

        // `rhs` is the answer.
        (Category::Zero, _) | (_, Category::NaN) | (_, Category::Infinity) => {
            self = rhs;
            Status::OK
        }

        // Not a simple case: do the actual arithmetic.
        (Category::Normal, Category::Normal) => {
            let mut rhs_sig = [rhs.sig[0]];
            let loss = sig::add_or_sub(
                &mut self.sig,
                &mut self.exp,
                &mut self.sign,
                &mut rhs_sig,
                rhs.exp,
                rhs.sign,
            );
            let status;
            self = unpack!(status=, self.normalize(round, loss));

            // Can only be zero if we lost no fraction.
            assert!(loss == Loss::ExactlyZero || self.category != Category::Zero);

            status
        }
    };

    // If two numbers add (exactly) to zero, IEEE 754 decrees it is a
    // positive zero unless rounding to minus infinity, except that
    // adding two like-signed zeroes gives that zero.
    let like_signed_zeroes = rhs.category == Category::Zero && self.sign == rhs.sign;
    if self.category == Category::Zero && !like_signed_zeroes {
        self.sign = round == Round::TowardNegative;
    }

    status.and(self)
}
/// Multiplies `self` by `rhs` using rounding mode `round`.
///
/// Returns the product together with the status flags raised. Zero times
/// infinity yields `Self::NAN` with `Status::INVALID_OP`; a NaN operand
/// produces a NaN with the sign cleared.
fn mul_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
    // The product's sign is the XOR of the operand signs.
    self.sign ^= rhs.sign;

    let status = match (self.category, rhs.category) {
        (Category::NaN, _) => {
            self.sign = false;
            Status::OK
        }

        (_, Category::NaN) => {
            // Take over the NaN operand's significand.
            self.sign = false;
            self.category = Category::NaN;
            self.sig = rhs.sig;
            Status::OK
        }

        (Category::Zero, Category::Infinity) | (Category::Infinity, Category::Zero) => {
            self = Self::NAN;
            Status::INVALID_OP
        }

        (_, Category::Infinity) | (Category::Infinity, _) => {
            self.category = Category::Infinity;
            Status::OK
        }

        (Category::Zero, _) | (_, Category::Zero) => {
            self.category = Category::Zero;
            Status::OK
        }

        (Category::Normal, Category::Normal) => {
            self.exp += rhs.exp;
            let mut product = [0; 2];
            let loss = sig::mul(&mut product, &mut self.exp, &self.sig, &rhs.sig, S::PRECISION);
            self.sig = [product[0]];

            let mut status;
            self = unpack!(status=, self.normalize(round, loss));
            if loss != Loss::ExactlyZero {
                status |= Status::INEXACT;
            }
            status
        }
    };

    status.and(self)
}
/// Computes `self * multiplicand + addend` using rounding mode `round`,
/// returning the result together with the status flags raised.
///
/// When both factors are finite and non-zero and the addend is finite, the
/// product is kept in a double-width significand before the addend is
/// applied; otherwise the work is delegated to `mul_r` followed by `add_r`.
fn mul_add_r(mut self, multiplicand: Self, addend: Self, round: Round) -> StatusAnd<Self> {
// If and only if all arguments are normal do we need to do an
// extended-precision calculation.
if !self.is_finite_non_zero() || !multiplicand.is_finite_non_zero() || !addend.is_finite() {
let mut status;
self = unpack!(status=, self.mul_r(multiplicand, round));
// `status` can only be Status::OK or Status::INVALID_OP. There is no more work
// to do in the latter case. The IEEE-754R standard says it is
// implementation-defined in this case whether, if ADDEND is a
// quiet NaN, we raise invalid op; this implementation does so.
//
// If we need to do the addition we can do so with normal
// precision.
if status == Status::OK {
self = unpack!(status=, self.add_r(addend, round));
}
return status.and(self);
}
// Post-multiplication sign, before addition.
self.sign ^= multiplicand.sign;
// Allocate space for twice as many bits as the original significand, plus one
// extra bit for the addition to overflow into.
assert!(limbs_for_bits(S::PRECISION * 2 + 1) <= 2);
let mut wide_sig = sig::widening_mul(self.sig[0], multiplicand.sig[0]);
let mut loss = Loss::ExactlyZero;
let mut omsb = sig::omsb(&wide_sig);
self.exp += multiplicand.exp;
// Assume the operands involved in the multiplication are single-precision
// FP, and the two multiplicands are:
// lhs = a23 . a22 ... a0 * 2^e1
// rhs = b23 . b22 ... b0 * 2^e2
// the result of multiplication is:
// lhs = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
// Note that there are three significant bits at the left-hand side of the
// radix point: two for the multiplication, and an overflow bit for the
// addition (that will always be zero at this point). Move the radix point
// toward left by two bits, and adjust exponent accordingly.
self.exp += 2;
if addend.is_non_zero() {
// Normalize our MSB to one below the top bit to allow for overflow.
let ext_precision = 2 * S::PRECISION + 1;
if omsb != ext_precision - 1 {
assert!(ext_precision > omsb);
sig::shift_left(&mut wide_sig, &mut self.exp, (ext_precision - 1) - omsb);
}
// The intermediate result of the multiplication has "2 * S::PRECISION"
// significant bits; adjust the addend to be consistent with mul result.
let mut ext_addend_sig = [addend.sig[0], 0];
// Extend the addend significand to ext_precision - 1. This guarantees
// that the high bit of the significand is zero (same as wide_sig),
// so the addition will overflow (if it does overflow at all) into the top bit.
sig::shift_left(&mut ext_addend_sig, &mut 0, ext_precision - 1 - S::PRECISION);
loss = sig::add_or_sub(
&mut wide_sig,
&mut self.exp,
&mut self.sign,
&mut ext_addend_sig,
addend.exp + 1,
addend.sign,
);
omsb = sig::omsb(&wide_sig);
}
// Convert the result having "2 * S::PRECISION" significant-bits back to the one
// having "S::PRECISION" significant-bits. First, move the radix point from
// position "2*S::PRECISION - 1" to "S::PRECISION - 1". The exponent needs to be
// adjusted by "2*S::PRECISION - 1" - "S::PRECISION - 1" = "S::PRECISION".
// NOTE(review): the statement below subtracts S::PRECISION + 1, one more than
// the derivation above; presumably this pairs with the `+= 2` adjustment made
// earlier. Confirm before changing either.
self.exp -= S::PRECISION as ExpInt + 1;
// In case MSB resides at the left-hand side of radix point, shift the
// mantissa right by some amount to make sure the MSB reside right before
// the radix point (i.e., "MSB . rest-significant-bits").
if omsb > S::PRECISION {
let bits = omsb - S::PRECISION;
loss = sig::shift_right(&mut wide_sig, &mut self.exp, bits).combine(loss);
}
// Only the low limb is kept as the result significand.
self.sig[0] = wide_sig[0];
let mut status;
self = unpack!(status=, self.normalize(round, loss));
if loss != Loss::ExactlyZero {
status |= Status::INEXACT;
}
// If two numbers add (exactly) to zero, IEEE 754 decrees it is a
// positive zero unless rounding to minus infinity, except that
// adding two like-signed zeroes gives that zero.
if self.category == Category::Zero
&& !status.intersects(Status::UNDERFLOW)
&& self.sign != addend.sign
{
self.sign = round == Round::TowardNegative;
}
status.and(self)
}
/// Divides `self` by `rhs` using rounding mode `round`.
///
/// Returns the quotient together with the status flags raised.
/// Infinity / infinity and zero / zero yield `Self::NAN` with
/// `Status::INVALID_OP`; a normal value divided by zero yields an infinity
/// with `Status::DIV_BY_ZERO`; a NaN operand produces a NaN with the sign
/// cleared.
fn div_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
    // The quotient's sign is the XOR of the operand signs.
    self.sign ^= rhs.sign;

    let status = match (self.category, rhs.category) {
        (Category::NaN, _) => {
            self.sign = false;
            Status::OK
        }

        (_, Category::NaN) => {
            // Take over the NaN operand's significand.
            self.category = Category::NaN;
            self.sig = rhs.sig;
            self.sign = false;
            Status::OK
        }

        (Category::Infinity, Category::Infinity) | (Category::Zero, Category::Zero) => {
            self = Self::NAN;
            Status::INVALID_OP
        }

        // Infinity or zero divided by anything else keeps its category.
        (Category::Infinity, _) | (Category::Zero, _) => Status::OK,

        (Category::Normal, Category::Infinity) => {
            self.category = Category::Zero;
            Status::OK
        }

        (Category::Normal, Category::Zero) => {
            self.category = Category::Infinity;
            Status::DIV_BY_ZERO
        }

        (Category::Normal, Category::Normal) => {
            self.exp -= rhs.exp;
            // Scratch copies of both operands for the division routine.
            let mut dividend = [self.sig[0]];
            let mut divisor = [rhs.sig[0]];
            let loss = sig::div(
                &mut self.sig,
                &mut self.exp,
                &mut dividend,
                &mut divisor,
                S::PRECISION,
            );

            let mut status;
            self = unpack!(status=, self.normalize(round, loss));
            if loss != Loss::ExactlyZero {
                status |= Status::INEXACT;
            }
            status
        }
    };

    status.and(self)
}
fn c_fmod(mut self, rhs: Self) -> StatusAnd<Self> {
match (self.category, rhs.category) {
(Category::NaN, _)
| (Category::Zero, Category::Infinity)
| (Category::Zero, Category::Normal)
| (Category::Normal, Category::Infinity) => Status::OK.and(self),
(_, Category::NaN) => {
self.sign = false;
self.category = Category::NaN;
self.sig = rhs.sig;
Status::OK.and(self)