Skip to content

Commit fba7987

Browse files
committed
Use runtime feature detection for fma routines on x86
Get performance closer to the glibc implementations by adding assembly fma routines, with runtime feature detection so they are used even if the crate is not compiled with `+fma` (as the distributed standard library often is not). Glibc uses ifuncs; this implementation stores a function pointer in an atomic. The detected CPU flags are also cached so that the detection cost is not paid again by calls to different functions. The feature detection code is a slightly simplified version of `std-detect`. Musl sources were used as a reference [1]. Fixes: rust-lang/rust#140452 once synced [1]: https://github.com/bminor/musl/blob/c47ad25ea3b484e10326f933e927c0bc8cded3da/src/math/x32/fma.c
1 parent 1b5c768 commit fba7987

File tree

8 files changed

+579
-3
lines changed

8 files changed

+579
-3
lines changed

etc/function-definitions.json

+2
Original file line numberDiff line numberDiff line change
@@ -343,13 +343,15 @@
343343
"fma": {
344344
"sources": [
345345
"libm/src/math/arch/aarch64.rs",
346+
"libm/src/math/arch/i686.rs",
346347
"libm/src/math/fma.rs"
347348
],
348349
"type": "f64"
349350
},
350351
"fmaf": {
351352
"sources": [
352353
"libm/src/math/arch/aarch64.rs",
354+
"libm/src/math/arch/i686.rs",
353355
"libm/src/math/fma.rs"
354356
],
355357
"type": "f32"

libm/src/math/arch/mod.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ cfg_if! {
1616
};
1717
} else if #[cfg(target_feature = "sse2")] {
1818
mod x86;
19-
pub use x86::{sqrt, sqrtf};
19+
20+
pub use x86::{sqrt, sqrtf, fma, fmaf};
2021
} else if #[cfg(all(
2122
any(target_arch = "aarch64", target_arch = "arm64ec"),
2223
target_feature = "neon"

libm/src/math/arch/x86.rs

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
//! Architecture-specific support for x86-32 and x86-64 with SSE2
22
3+
mod detect;
4+
mod fma;
5+
6+
pub use fma::{fma, fmaf};
7+
38
pub fn sqrtf(mut x: f32) -> f32 {
49
// SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory
510
// access or side effects.

libm/src/math/arch/x86/detect.rs

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
#[allow(unused_imports)]
2+
use crate::support::{Flags, get_or_init_flags_cache};
3+
4+
/// CPU features that get cached (doesn't correlate to anything on the CPU).
///
/// Each name below is a mask used with `Flags`; the bit a mask occupies is this crate's own
/// encoding and is unrelated to the `cpuid` register bit the feature is read from (the mapping
/// from register bits to these masks is done in `load_x86_features`).
///
/// NOTE(review): the test module reads `cpu_flags::ALL` and `cpu_flags::NAMES`; presumably both
/// are generated by `unique_masks!` — confirm in `crate::support`.
5+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6+
pub mod cpu_flags {
7+
use crate::support::unique_masks;
8+
9+
unique_masks! {
// presumably the integer type backing every mask (the cache is an `AtomicU32`) — confirm
10+
u32,
11+
SSE3,
12+
F16C,
13+
SSE,
14+
SSE2,
15+
ERMSB,
16+
MOVRS,
17+
FMA,
18+
FMA4,
19+
AVX512FP16,
20+
AVX512BF16,
21+
}
22+
}
23+
24+
/// Get CPU features, loading from a cache if available.
///
/// The cache is a function-local `static` `AtomicU32` that starts at 0. It is handed to
/// `get_or_init_flags_cache` together with `load_x86_features`, which performs the actual
/// `cpuid` queries.
///
/// NOTE(review): presumably `get_or_init_flags_cache` only calls the loader while the cache has
/// not been populated yet — confirm in `crate::support`.
25+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
26+
pub fn get_cpu_features() -> Flags {
27+
use core::sync::atomic::AtomicU32;
28+
// Shared by every caller of this function; 0 is the initial (not yet loaded) value.
static CACHE: AtomicU32 = AtomicU32::new(0);
29+
get_or_init_flags_cache(&CACHE, load_x86_features)
30+
}
31+
32+
/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
33+
///
/// Returns an empty `Flags` when built for `target_env = "sgx"`, and the still-empty accumulator
/// when the maximum basic leaf reported by `cpuid` is below 1. `FMA` and the AVX-512 flags are
/// only set after the `XSAVE`/`OSXSAVE` bits and the matching `XCR0` bits have been checked.
/// `FMA4` is only considered for the `AuthenticAMD` and `HygonGenuine` vendor IDs.
///
34+
/// Implementation is taken from [std-detect][std-detect].
35+
///
36+
/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
37+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
38+
fn load_x86_features() -> Flags {
39+
#[cfg(target_arch = "x86")]
40+
use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
41+
#[cfg(target_arch = "x86_64")]
42+
use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
43+
44+
// Accumulator for every detected feature; returned at the end.
let mut value = Flags::empty();
45+
46+
if cfg!(target_env = "sgx") {
47+
// `cpuid` results are untrusted data under SGX, so no features are reported there.
48+
return Flags::empty();
49+
}
50+
51+
// Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
52+
// has `cpuid` support.
// NOTE(review): no explicit `cpuid` availability check is made in this function; presumably
// the `sse2` gate on the parent module guarantees it — confirm.
53+
54+
// 0. EAX = 0: Basic Information:
55+
// - EAX returns the "Highest Function Parameter", that is, the maximum leaf
56+
// value for subsequent calls of `cpuid` in range [0, 0x8000_0000].
57+
// - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
58+
// (in that order)
59+
let mut vendor_id = [0u8; 12];
60+
let max_basic_leaf;
61+
unsafe {
62+
let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
63+
max_basic_leaf = eax;
64+
vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
65+
vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
66+
vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
67+
}
68+
69+
if max_basic_leaf < 1 {
70+
// Earlier Intel 486, CPUID not implemented
// (leaf 1 cannot be queried, so nothing has been detected and `value` is still empty).
71+
return value;
72+
}
73+
74+
// EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
75+
// Contains information about most x86 features.
76+
let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
77+
let proc_info_ecx = Flags::from_bits(ecx);
78+
let proc_info_edx = Flags::from_bits(edx);
79+
80+
// EAX = 7: Queries "Extended Features";
81+
// This function reads ERMSB (EBX) and AVX512FP16 (EDX) from sub-leaf 0, and MOVRS and
// AVX512BF16 from EAX of sub-leaf 1. All three stay empty when leaf 7 is unavailable.
82+
let mut extended_features_ebx = Flags::empty();
83+
let mut extended_features_edx = Flags::empty();
84+
let mut extended_features_eax_leaf_1 = Flags::empty();
85+
if max_basic_leaf >= 7 {
86+
let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
87+
extended_features_ebx = Flags::from_bits(ebx);
88+
extended_features_edx = Flags::from_bits(edx);
89+
90+
let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
91+
extended_features_eax_leaf_1 = Flags::from_bits(eax)
92+
}
93+
94+
// EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
95+
// - EAX returns the max leaf value for extended information, that is,
96+
// `cpuid` calls in range [0x8000_0000; u32::MAX]:
97+
let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;
98+
99+
// EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits"
// NOTE(review): the guard below compares against 1 rather than 0x8000_0001 — confirm that
// this is the intended threshold.
100+
let mut extended_proc_info_ecx = Flags::empty();
101+
if extended_max_basic_leaf >= 1 {
102+
let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
103+
extended_proc_info_ecx = Flags::from_bits(ecx);
104+
}
105+
106+
// Insert `flag` into `value` when bit number `regbit` of the register snapshot `regflags`
// is set; otherwise leave `value` untouched.
let mut enable = |regflags: Flags, regbit, flag| {
107+
if regflags.test_nth(regbit) {
108+
value.insert(flag);
109+
}
110+
};
111+
112+
// Features that are reported directly by a single register bit.
enable(proc_info_ecx, 0, cpu_flags::SSE3);
113+
enable(proc_info_ecx, 29, cpu_flags::F16C);
114+
enable(proc_info_edx, 25, cpu_flags::SSE);
115+
enable(proc_info_edx, 26, cpu_flags::SSE2);
116+
enable(extended_features_ebx, 9, cpu_flags::ERMSB);
117+
enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);
118+
119+
// `XSAVE` and `AVX` support:
120+
let cpu_xsave = proc_info_ecx.test_nth(26);
121+
if cpu_xsave {
122+
// 0. Here the CPU supports `XSAVE`.
123+
124+
// 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
125+
// supports saving the state of the AVX/AVX2 vector registers on
126+
// context-switches, see:
127+
//
128+
// - [intel: is avx enabled?][is_avx_enabled],
129+
// - [mozilla: sse.cpp][mozilla_sse_cpp].
130+
//
131+
// [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
132+
// [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
133+
let cpu_osxsave = proc_info_ecx.test_nth(27);
134+
135+
if cpu_osxsave {
136+
// 2. The OS must have signaled the CPU that it supports saving and
137+
// restoring the:
138+
//
139+
// * SSE -> `XCR0.SSE[1]`
140+
// * AVX -> `XCR0.AVX[2]`
141+
// * AVX-512 -> `XCR0.AVX-512[7:5]`.
142+
// * AMX -> `XCR0.AMX[18:17]`
143+
//
144+
// by setting the corresponding bits of `XCR0` to `1`.
145+
//
146+
// This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
147+
let xcr0 = unsafe { _xgetbv(0) };
148+
// Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
149+
let os_avx_support = xcr0 & 6 == 6;
150+
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
151+
let os_avx512_support = xcr0 & 0xe0 == 0xe0;
152+
153+
// Only if the OS and the CPU support saving/restoring the AVX
154+
// registers do we enable the features that depend on them (here: FMA and AVX-512):
155+
if os_avx_support {
156+
// See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
157+
// FEATURES" in the "Intel® 64 and IA-32 Architectures Software
158+
// Developer’s Manual, Volume 1: Basic Architecture":
159+
//
160+
// "Software enables the XSAVE feature set by setting
161+
// CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
162+
// instruction). If this bit is 0, execution of any of XGETBV,
163+
// XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
164+
// causes an invalid-opcode exception (#UD)"
165+
166+
// FMA (uses 256-bit wide registers):
167+
enable(proc_info_ecx, 12, cpu_flags::FMA);
168+
169+
// For AVX-512 the OS also needs to support saving/restoring
170+
// the extended state, only then we enable AVX-512 support:
171+
if os_avx512_support {
172+
enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
173+
enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
174+
}
175+
}
176+
}
177+
}
178+
179+
// As Hygon Dhyana originates from AMD technology and shares most of the architecture with
180+
// AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number
181+
// (Family 18h).
182+
//
183+
// For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
184+
// family 17h.
185+
//
186+
// Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
187+
// (AMD64 Architecture Programmer's Manual, Appendix E).
188+
// Related Hygon kernel patch can be found on
189+
// http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
190+
if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
191+
// These features are available on AMD arch CPUs:
192+
enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
193+
}
194+
195+
value
196+
}
197+
198+
// Cross-checks the hand-rolled detection against `std`'s `is_x86_feature_detected!` on the
// machine running the tests.
#[cfg(test)]
199+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
200+
mod tests {
201+
extern crate std;
202+
use std::is_x86_feature_detected;
203+
204+
use super::*;
205+
206+
/// For every flag in `cpu_flags::ALL`, assert that `get_cpu_features()` agrees with `std`.
///
/// `MOVRS` and `FMA4` are skipped because `std` cannot be queried for them here. Any flag
/// without a match arm panics, so a newly added flag cannot go untested silently.
#[test]
207+
fn check_matches_std() {
208+
let features = get_cpu_features();
209+
// `ALL` and `NAMES` are indexed in lockstep: `NAMES[i]` is used as the label for `ALL[i]`.
for i in 0..cpu_flags::ALL.len() {
210+
let flag = cpu_flags::ALL[i];
211+
let name = cpu_flags::NAMES[i];
212+
213+
let std_detected = match flag {
214+
cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
215+
cpu_flags::F16C => is_x86_feature_detected!("f16c"),
216+
cpu_flags::SSE => is_x86_feature_detected!("sse"),
217+
cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
218+
cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
219+
cpu_flags::MOVRS => continue, // only very recent support in std
220+
cpu_flags::FMA => is_x86_feature_detected!("fma"),
221+
cpu_flags::FMA4 => continue, // not yet supported in std
222+
cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
223+
cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
224+
_ => panic!("untested CPU flag {name}"),
225+
};
226+
227+
assert_eq!(
228+
std_detected,
229+
features.contains(flag),
230+
"different flag {name}. flags: {features:?}"
231+
);
232+
}
233+
}
234+
}

0 commit comments

Comments
 (0)