// Using runtime feature detection requires atomics. Currently there are no x86 targets // that support sse but not `AtomicPtr`. #[cfg(target_arch = "x86")] use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; use crate::support::feature_detect::{Flags, get_or_init_flags_cache, unique_masks}; /// CPU features that get cached (doesn't correlate to anything on the CPU). pub mod cpu_flags { use super::unique_masks; unique_masks! { u32, SSE3, F16C, SSE, SSE2, ERMSB, MOVRS, FMA, FMA4, AVX512FP16, AVX512BF16, } } /// Get CPU features, loading from a cache if available. pub fn get_cpu_features() -> Flags { use core::sync::atomic::AtomicU32; static CACHE: AtomicU32 = AtomicU32::new(0); get_or_init_flags_cache(&CACHE, load_x86_features) } /// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`. /// /// Implementation is taken from [std-detect][std-detect]. /// /// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142 fn load_x86_features() -> Flags { let mut value = Flags::empty(); if cfg!(target_env = "sgx") { // doesn't support this because it is untrusted data return Flags::empty(); } // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU // has `cpuid` support. // 0. EAX = 0: Basic Information: // - EAX returns the "Highest Function Parameter", that is, the maximum leaf // value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000]. // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX // (in that order) let mut vendor_id = [0u8; 12]; let max_basic_leaf; unsafe { let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0); max_basic_leaf = eax; vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes()); vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes()); vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes()); } if max_basic_leaf < 1 { // Earlier Intel 486, CPUID not implemented return value; } // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits"; // Contains information about most x86 features. let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) }; let proc_info_ecx = Flags::from_bits(ecx); let proc_info_edx = Flags::from_bits(edx); // EAX = 7: Queries "Extended Features"; // Contains information about bmi,bmi2, and avx2 support. let mut extended_features_ebx = Flags::empty(); let mut extended_features_edx = Flags::empty(); let mut extended_features_eax_leaf_1 = Flags::empty(); if max_basic_leaf >= 7 { let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; extended_features_ebx = Flags::from_bits(ebx); extended_features_edx = Flags::from_bits(edx); let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; extended_features_eax_leaf_1 = Flags::from_bits(eax) } // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported // - EAX returns the max leaf value for extended information, that is, // `cpuid` calls in range [0x8000_0000; u32::MAX]: let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax; // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits" let mut extended_proc_info_ecx = Flags::empty(); if extended_max_basic_leaf >= 1 { let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) }; extended_proc_info_ecx = Flags::from_bits(ecx); } let mut enable = |regflags: Flags, regbit, flag| { if regflags.test_nth(regbit) { value.insert(flag); } }; enable(proc_info_ecx, 0, cpu_flags::SSE3); enable(proc_info_ecx, 29, cpu_flags::F16C); enable(proc_info_edx, 25, cpu_flags::SSE); enable(proc_info_edx, 26, cpu_flags::SSE2); enable(extended_features_ebx, 9, cpu_flags::ERMSB); enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS); // `XSAVE` and `AVX` support: let cpu_xsave = proc_info_ecx.test_nth(26); if cpu_xsave { // 0. Here the CPU supports `XSAVE`. // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and // supports saving the state of the AVX/AVX2 vector registers on // context-switches, see: // // - [intel: is avx enabled?][is_avx_enabled], // - [mozilla: sse.cpp][mozilla_sse_cpp]. // // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190 let cpu_osxsave = proc_info_ecx.test_nth(27); if cpu_osxsave { // 2. The OS must have signaled the CPU that it supports saving and // restoring the: // // * SSE -> `XCR0.SSE[1]` // * AVX -> `XCR0.AVX[2]` // * AVX-512 -> `XCR0.AVX-512[7:5]`. // * AMX -> `XCR0.AMX[18:17]` // // by setting the corresponding bits of `XCR0` to `1`. // // This is safe because the CPU supports `xsave` and the OS has set `osxsave`. let xcr0 = unsafe { _xgetbv(0) }; // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`: let os_avx_support = xcr0 & 6 == 6; // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`: let os_avx512_support = xcr0 & 0xe0 == 0xe0; // Only if the OS and the CPU support saving/restoring the AVX // registers we enable `xsave` support: if os_avx_support { // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED // FEATURES" in the "Intel® 64 and IA-32 Architectures Software // Developer’s Manual, Volume 1: Basic Architecture": // // "Software enables the XSAVE feature set by setting // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4 // instruction). If this bit is 0, execution of any of XGETBV, // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV // causes an invalid-opcode exception (#UD)" // FMA (uses 256-bit wide registers): enable(proc_info_ecx, 12, cpu_flags::FMA); // For AVX-512 the OS also needs to support saving/restoring // the extended state, only then we enable AVX-512 support: if os_avx512_support { enable(extended_features_edx, 23, cpu_flags::AVX512FP16); enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16); } } } } // As Hygon Dhyana originates from AMD technology and shares most of the architecture with // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number // (Family 18h). // // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD // family 17h. // // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf // (AMD64 Architecture Programmer's Manual, Appendix E). // Related Hygon kernel patch can be found on // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" { // These features are available on AMD arch CPUs: enable(extended_proc_info_ecx, 16, cpu_flags::FMA4); } value } #[cfg(test)] mod tests { extern crate std; use std::is_x86_feature_detected; use super::*; #[test] fn check_matches_std() { let features = get_cpu_features(); for i in 0..cpu_flags::ALL.len() { let flag = cpu_flags::ALL[i]; let name = cpu_flags::NAMES[i]; let std_detected = match flag { cpu_flags::SSE3 => is_x86_feature_detected!("sse3"), cpu_flags::F16C => is_x86_feature_detected!("f16c"), cpu_flags::SSE => is_x86_feature_detected!("sse"), cpu_flags::SSE2 => is_x86_feature_detected!("sse2"), cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"), cpu_flags::MOVRS => continue, // only very recent support in std cpu_flags::FMA => is_x86_feature_detected!("fma"), cpu_flags::FMA4 => continue, // not yet supported in std cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"), cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"), _ => panic!("untested CPU flag {name}"), }; assert_eq!( std_detected, features.contains(flag), "different flag {name}. flags: {features:?}" ); } } }