
Commit 77429e2

Optimize performance of fmod with Barrett multiplication
1 parent bf85502
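
For context, Barrett reduction trades each division by a fixed modulus for a
multiplication by a precomputed, scaled reciprocal plus at most one correction
subtraction. The `Reducer` type this commit uses lives in `crate::support` and
is not part of the diff below, so the following is only a minimal standalone
sketch of the technique for a fixed `u64` modulus; the `Barrett` name and its
API are hypothetical, not the crate's.

    /// Sketch only: word-level Barrett reduction for a fixed u64 modulus.
    struct Barrett {
        y: u64, // the modulus, must be >= 2
        r: u64, // floor(2^64 / y): the scaled reciprocal, computed once
    }

    impl Barrett {
        fn new(y: u64) -> Self {
            assert!(y >= 2);
            Barrett { y, r: ((1u128 << 64) / y as u128) as u64 }
        }

        /// Compute `x % y` with one widening multiply and no division.
        fn rem(&self, x: u64) -> u64 {
            // Quotient estimate floor(x * r / 2^64); it never exceeds the
            // true quotient and is low by at most 1.
            let q = ((x as u128 * self.r as u128) >> 64) as u64;
            let mut rem = x - q * self.y;
            if rem >= self.y {
                rem -= self.y; // single correction step
            }
            rem
        }
    }

    // e.g. Barrett::new(1_000_003).rem(123_456_789) == 123_456_789 % 1_000_003

The repeated `squared_with_shift` calls in the new `fast_reduction` below keep
reusing one such precomputed reciprocal, which is where the speedup over the
per-bit shift-and-subtract loop comes from.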

1 file changed: +93 -0


libm/src/math/generic/fmod.rs

@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
 use super::super::{CastFrom, Float, Int, MinInt};
+use crate::support::{DInt, HInt, Reducer};

 #[inline]
 pub fn fmod<F: Float>(x: F, y: F) -> F {
@@ -59,10 +60,102 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {

 /// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
 fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
+    // FIXME: This is a temporary hack to get around the lack of `u256 / u256`.
+    // Actually, the algorithm only needs the operation `(x << I::BITS) / y`
+    // where `x < y`. That is, a division `u256 / u128` where the quotient must
+    // not overflow `u128` would be sufficient for `f128`.
+    unsafe {
+        use core::mem::transmute_copy;
+        if I::BITS == 64 {
+            let x = transmute_copy::<I, u64>(&x);
+            let y = transmute_copy::<I, u64>(&y);
+            let r = fast_reduction::<f64, u64>(x, e, y);
+            return transmute_copy::<u64, I>(&r);
+        }
+        if I::BITS == 32 {
+            let x = transmute_copy::<I, u32>(&x);
+            let y = transmute_copy::<I, u32>(&y);
+            let r = fast_reduction::<f32, u32>(x, e, y);
+            return transmute_copy::<u32, I>(&r);
+        }
+        #[cfg(f16_enabled)]
+        if I::BITS == 16 {
+            let x = transmute_copy::<I, u16>(&x);
+            let y = transmute_copy::<I, u16>(&y);
+            let r = fast_reduction::<f16, u16>(x, e, y);
+            return transmute_copy::<u16, I>(&r);
+        }
+    }
+
     x %= y;
     for _ in 0..e {
         x <<= 1;
         x = x.checked_sub(y).unwrap_or(x);
     }
     x
 }
+
+trait SafeShift: Float {
+    // How many guaranteed leading zeros do the values have?
+    // A normalized floating point mantissa has `EXP_BITS` guaranteed leading
+    // zeros (excludes the implicit bit, but includes the now-zeroed sign bit)
+    // `-1` because we want to shift by either `BASE_SHIFT` or `BASE_SHIFT + 1`
+    const BASE_SHIFT: u32 = Self::EXP_BITS - 1;
+}
+impl<F: Float> SafeShift for F {}
+
+fn fast_reduction<F, I>(x: I, e: u32, y: I) -> I
+where
+    F: Float<Int = I>,
+    I: Int + HInt,
+    I::D: Int + DInt<H = I>,
+{
+    let _0 = I::ZERO;
+    let _1 = I::ONE;
+
+    if y == _1 {
+        return _0;
+    }
+
+    if e <= F::BASE_SHIFT {
+        return (x << e) % y;
+    }
+
+    // Find least depth s.t. `(e >> depth) < I::BITS`
+    let depth = (I::BITS - 1)
+        .leading_zeros()
+        .saturating_sub(e.leading_zeros());
+
+    let initial = (e >> depth) - F::BASE_SHIFT;
+
+    let max_rem = y.wrapping_sub(_1);
+    let max_ilog2 = max_rem.ilog2();
+    let mut pow2 = _1 << max_ilog2.min(initial);
+    for _ in max_ilog2..initial {
+        pow2 <<= 1;
+        pow2 = pow2.checked_sub(y).unwrap_or(pow2);
+    }
+
+    // At each step `k in [depth, ..., 0]`,
+    // `p` is `(e >> k) - BASE_SHIFT`
+    // `m` is `(1 << p) % y`
+    let mut k = depth;
+    let mut p = initial;
+    let mut m = Reducer::new(pow2, y);
+
+    while k > 0 {
+        k -= 1;
+        p = p + p + F::BASE_SHIFT;
+        if e & (1 << k) != 0 {
+            m = m.squared_with_shift(F::BASE_SHIFT + 1);
+            p += 1;
+        } else {
+            m = m.squared_with_shift(F::BASE_SHIFT);
+        };
+
+        debug_assert_eq!(p, (e >> k) - F::BASE_SHIFT);
+    }
+
+    // (x << BASE_SHIFT) * (1 << p) == x << e
+    m.mul_into_div_rem(x << F::BASE_SHIFT).1
+}
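
The new `fast_reduction` maintains the invariant `m == (1 << p) % y` while
consuming the bits of `e` from high to low, so computing `(x << e) % y` takes
O(log e) modular multiplications instead of the O(e) single-bit steps of the
fallback loop above. Below is a rough standalone rendering of that
square-and-shift recurrence, using plain `u128` arithmetic in place of
`Reducer` and omitting the `BASE_SHIFT` bookkeeping (which lets the real code
fold `x << BASE_SHIFT` into the final multiply); the function names here are
illustrative.

    /// Maintain m == (1 << p) % y over the bits of e, high to low.
    /// After the loop, p == e, so m == (1 << e) % y.
    fn pow2_mod(e: u32, y: u64) -> u64 {
        assert!(y >= 2);
        let mut m = 1u64; // p = 0
        for k in (0..u32::BITS - e.leading_zeros()).rev() {
            // Squaring doubles p.
            m = ((m as u128 * m as u128) % y as u128) as u64;
            if e & (1 << k) != 0 {
                // Bit k of e contributes one extra shift: p += 1.
                m = ((m as u128 * 2) % y as u128) as u64;
            }
        }
        m
    }

    /// (x << e) % y == ((x % y) * ((1 << e) % y)) % y, via u128 widening.
    fn shifted_rem(x: u64, e: u32, y: u64) -> u64 {
        ((x % y) as u128 * pow2_mod(e, y) as u128 % y as u128) as u64
    }

    // Spot-check against the shift-and-subtract fallback:
    // let (x, e, y) = (0x1234_5678_9abc_def0u64, 75, 1_000_003u64);
    // let mut naive = x % y;
    // for _ in 0..e {
    //     naive = (naive << 1) % y;
    // }
    // assert_eq!(shifted_rem(x, e, y), naive);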

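For concreteness, take F = f64 (so I = u64, EXP_BITS = 11, BASE_SHIFT = 10):
a normalized significand occupies the low 53 bits of the u64, which leaves
exactly EXP_BITS leading zero bits, so shifts by BASE_SHIFT or BASE_SHIFT + 1
cannot overflow. The exponent difference `e` can reach roughly 2^11 (on the
order of 1023 - (-1074) for an extreme normal/subnormal pair), and `depth`
selects the least k with `(e >> k) < I::BITS`. A hypothetical spot-check of
that formula, specialized to u64:

    /// Least k such that (e >> k) < u64::BITS, mirroring the generic code.
    fn depth_u64(e: u32) -> u32 {
        (64u32 - 1).leading_zeros().saturating_sub(e.leading_zeros())
    }

    fn main() {
        // e of about 2098 is the ballpark for fmod(f64::MAX, smallest subnormal).
        let e = 2098u32;
        let depth = depth_u64(e);
        assert_eq!(depth, 6);
        assert!(e >> depth < 64); // 2098 >> 6 == 32
        assert!(e >> (depth - 1) >= 64); // and 6 is the least such k
    }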