diff options
| author | sayantn <sayantan.chakraborty@students.iiserpune.ac.in> | 2024-06-30 01:42:36 +0530 |
|---|---|---|
| committer | Amanieu d'Antras <amanieu@gmail.com> | 2024-07-06 11:00:34 +0200 |
| commit | 1c3b3b80c00ebec8048806ca884938c03cfc4799 (patch) | |
| tree | 574f3e900f5aff2f92fbae28d15704bf875437ba /library/stdarch/crates | |
| parent | 1f3264848fc4dd2c6667744227e444f3ebdb147b (diff) | |
| download | rust-1c3b3b80c00ebec8048806ca884938c03cfc4799.tar.gz rust-1c3b3b80c00ebec8048806ca884938c03cfc4799.zip | |
Fix the stream intrinsics
They should use a platform-specific address management.
Diffstat (limited to 'library/stdarch/crates')
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/avx.rs | 12 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/avx2.rs | 4 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/avx512bw.rs | 2 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/avx512f.rs | 69 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/macros.rs | 30 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/sse.rs | 4 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/sse2.rs | 12 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/sse41.rs | 4 | ||||
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86_64/sse2.rs | 4 |
9 files changed, 82 insertions, 59 deletions
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index aa5a5d8c184..7726a188f2b 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { crate::arch::asm!( - "vmovntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); @@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { crate::arch::asm!( - "vmovntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); @@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 0343416a921..fa32c7fcc47 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i { let dst: __m256i; crate::arch::asm!( - "vmovntdqa {a}, [{mem_addr}]", + vpl!("vmovntdqa {a}"), a = out(ymm_reg) dst, - mem_addr = in(reg) mem_addr, + p = in(reg) mem_addr, options(pure, readonly, nostack, preserves_flags), ); dst diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 1f786d01f0b..dd74d11786f 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -8,8 +8,6 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; -use super::avx512f::{vpl, vps}; - /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30) diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 3fe919abc80..cbda06b1ab5 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -6,37 +6,6 @@ use crate::{ mem, ptr, }; -// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full -// register name (e.g. rax). We have to explicitly override the placeholder to -// use the 32-bit register name in that case. - -#[cfg(target_pointer_width = "32")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p:e}]") - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p}]") - }; -} -#[cfg(target_pointer_width = "32")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p:e}]", $inst2) - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p}]", $inst2) - }; -} - -pub(crate) use {vpl, vps}; - #[cfg(test)] use stdarch_test::assert_instr; @@ -27899,8 +27868,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); @@ -27925,8 +27894,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { crate::arch::asm!( - "vmovntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); @@ -27951,13 +27920,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) { crate::arch::asm!( - "vmovntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); } +/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr +/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si256) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i { + let dst: __m512i; + crate::arch::asm!( + vpl!("vmovntdqa {a}"), + a = out(zmm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + /// Sets packed 32-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931) @@ -54567,6 +54555,13 @@ mod tests { } #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_load_si512() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _); + assert_eq_m512i(a, r); + } + + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_epi32() { let a = _mm512_set1_epi32(1); let e: i32 = _mm512_reduce_add_epi32(a); diff --git a/library/stdarch/crates/core_arch/src/x86/macros.rs b/library/stdarch/crates/core_arch/src/x86/macros.rs index 17d64f5bbfd..ddf38aa5063 100644 --- a/library/stdarch/crates/core_arch/src/x86/macros.rs +++ b/library/stdarch/crates/core_arch/src/x86/macros.rs @@ -57,3 +57,33 @@ macro_rules! assert_approx_eq { ); }}; } + +// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. + +#[cfg(target_pointer_width = "32")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p:e}]") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p}]") + }; +} + +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs index a4602301de1..ea6e685acbc 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse.rs @@ -1992,8 +1992,8 @@ extern "C" { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { crate::arch::asm!( - "movntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntps", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 289d41a0ffa..0dee597410e 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { crate::arch::asm!( - "movntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntdq", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); @@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { crate::arch::asm!( - "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value - mem_addr = in(reg) mem_addr, + vps!("movnti", ",{a:e}"), // `:e` for 32bit value + p = in(reg) mem_addr, a = in(reg) a, options(nostack, preserves_flags), ); @@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { crate::arch::asm!( - "movntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntpd", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs index c8b260bec8e..daf89bc3fd0 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse41.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs @@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { let dst: __m128i; crate::arch::asm!( - "movntdqa {a}, [{mem_addr}]", + vpl!("movntdqa {a}"), a = out(xmm_reg) dst, - mem_addr = in(reg) mem_addr, + p = in(reg) mem_addr, options(pure, readonly, nostack, preserves_flags), ); dst diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs index e5069058cdb..8f85d4e2829 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs @@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { crate::arch::asm!( - "movnti [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + "movnti [{p}], {a}", + p = in(reg) mem_addr, a = in(reg) a, options(nostack, preserves_flags), ); |
