| author    | Tobias Decking <Tobias.Decking@gmail.com> | 2024-06-21 22:59:24 +0200 |
|-----------|-------------------------------------------|---------------------------|
| committer | Amanieu d'Antras <amanieu@gmail.com>      | 2024-06-30 15:47:18 +0200 |
| commit    | fcee4d8b16ba2800e59f53e4f43469329f7d005b (patch) | |
| tree      | 3bcf4ea711a9c75c22ee7a2d06934b186caa039b /library/stdarch/crates | |
| parent    | a56cc86a23076d7e9fd49210ccb36dafb18c5dbb (diff) | |
Define remaining IFMA intrinsics
Diffstat (limited to 'library/stdarch/crates')
| -rw-r--r-- | library/stdarch/crates/core_arch/missing-x86.md        | 22  |
| -rw-r--r-- | library/stdarch/crates/core_arch/src/x86/avx512ifma.rs | 455 |
2 files changed, 429 insertions, 48 deletions
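For reference, the `madd52` family computes, per 64-bit lane, a 52-bit × 52-bit → 104-bit unsigned multiply and then adds either the low or the high 52 bits of that product to a 64-bit accumulator. The following scalar model is a sketch for illustration only (it is not part of this commit; `madd52lo`/`madd52hi` are illustrative helper names); it reproduces the per-lane constants asserted by the new tests in the diff below:

```rust
// Scalar model of one 64-bit lane of VPMADD52LUQ / VPMADD52HUQ.
const MASK52: u64 = (1 << 52) - 1;

fn madd52lo(a: u64, b: u64, c: u64) -> u64 {
    // 52-bit x 52-bit -> 104-bit product, computed exactly in u128.
    let prod = (b & MASK52) as u128 * (c & MASK52) as u128;
    // Add the low 52 bits of the product to the 64-bit accumulator.
    a.wrapping_add(prod as u64 & MASK52)
}

fn madd52hi(a: u64, b: u64, c: u64) -> u64 {
    let prod = (b & MASK52) as u128 * (c & MASK52) as u128;
    // Add the high 52 bits of the product to the 64-bit accumulator.
    a.wrapping_add((prod >> 52) as u64)
}

fn main() {
    let (a, b, c) = (10u64 << 40, (11u64 << 40) + 4, (12u64 << 40) + 3);
    // Exactly the expected values used by the new tests.
    assert_eq!(madd52hi(a, b, c), 11030549757952);
    assert_eq!(madd52lo(a, b, c), 100055558127628);
}
```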
diff --git a/library/stdarch/crates/core_arch/missing-x86.md b/library/stdarch/crates/core_arch/missing-x86.md
index 11ad3f04c14..4c70c1b435c 100644
--- a/library/stdarch/crates/core_arch/missing-x86.md
+++ b/library/stdarch/crates/core_arch/missing-x86.md
@@ -219,28 +219,6 @@
 </p></details>
 
-<details><summary>["AVX512IFMA52"]</summary><p>
-
- * [ ] [`_mm512_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_madd52hi_epu64)
- * [ ] [`_mm512_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_madd52lo_epu64)
- * [ ] [`_mm512_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_madd52hi_epu64)
- * [ ] [`_mm512_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_madd52lo_epu64)
-</p></details>
-
-
-<details><summary>["AVX512IFMA52", "AVX512VL"]</summary><p>
-
- * [ ] [`_mm256_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_madd52hi_epu64)
- * [ ] [`_mm256_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_madd52lo_epu64)
- * [ ] [`_mm256_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_madd52hi_epu64)
- * [ ] [`_mm256_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_madd52lo_epu64)
- * [ ] [`_mm_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_madd52hi_epu64)
- * [ ] [`_mm_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_madd52lo_epu64)
- * [ ] [`_mm_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_madd52hi_epu64)
- * [ ] [`_mm_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_madd52lo_epu64)
-</p></details>
-
-
 <details><summary>["AVX512_BF16", "AVX512F"]</summary><p>
 
  * [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
index 332d2316d5f..01bb704ae73 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
@@ -1,4 +1,5 @@
 use crate::core_arch::x86::*;
+use crate::intrinsics::simd::simd_select_bitmask;
 
 #[cfg(test)]
 use stdarch_test::assert_instr;
@@ -9,7 +10,7 @@ use stdarch_test::assert_instr;
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -19,12 +20,52 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
 }
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_mask_madd52hi_epu64(
+    a: __m512i,
+    k: __mmask8,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_maskz_madd52hi_epu64(
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512())
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
 /// unsigned integer from the intermediate result with the
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -34,12 +75,52 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
 }
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm512_mask_madd52lo_epu64(
+    a: __m512i,
+    k: __mmask8,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm512_maskz_madd52lo_epu64(
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
 /// unsigned integer from the intermediate result with the
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -49,12 +130,52 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
 }
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_mask_madd52hi_epu64(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_maskz_madd52hi_epu64(
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
 /// unsigned integer from the intermediate result with the
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -64,12 +185,52 @@ pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
 }
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm256_mask_madd52lo_epu64(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm256_maskz_madd52lo_epu64(
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
 /// unsigned integer from the intermediate result with the
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -79,12 +240,42 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
 }
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
 /// unsigned integer from the intermediate result with the
 /// corresponding unsigned 64-bit integer in `a`, and store the
 /// results in `dst`.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -93,6 +284,36 @@ pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
     vpmadd52luq_128(a, b, c)
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are copied
+/// from `k` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst` using writemask `k` (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64)
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128())
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
@@ -116,87 +337,269 @@ mod tests {
     use crate::core_arch::x86::*;
 
+    const K: __mmask8 = 0b01101101;
+
     #[simd_test(enable = "avx512ifma")]
     unsafe fn test_mm512_madd52hi_epu64() {
-        let mut a = _mm512_set1_epi64(10 << 40);
+        let a = _mm512_set1_epi64(10 << 40);
         let b = _mm512_set1_epi64((11 << 40) + 4);
         let c = _mm512_set1_epi64((12 << 40) + 3);
 
-        a = _mm512_madd52hi_epu64(a, b, c);
+        let actual = _mm512_madd52hi_epu64(a, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
         let expected = _mm512_set1_epi64(11030549757952);
 
-        assert_eq_m512i(a, expected);
+        assert_eq_m512i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_mask_madd52hi_epu64() {
+        let a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        let actual = _mm512_mask_madd52hi_epu64(a, K, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let mut expected = _mm512_set1_epi64(11030549757952);
+        expected = _mm512_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m512i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_maskz_madd52hi_epu64() {
+        let a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let mut expected = _mm512_set1_epi64(11030549757952);
+        expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);
+
+        assert_eq_m512i(expected, actual);
     }
 
     #[simd_test(enable = "avx512ifma")]
     unsafe fn test_mm512_madd52lo_epu64() {
-        let mut a = _mm512_set1_epi64(10 << 40);
+        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
         let c = _mm512_set1_epi64((12 << 40) + 3);
 
-        a = _mm512_madd52lo_epu64(a, b, c);
+        let actual = _mm512_madd52lo_epu64(a, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
         let expected = _mm512_set1_epi64(100055558127628);
 
-        assert_eq_m512i(a, expected);
+        assert_eq_m512i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_mask_madd52lo_epu64() {
+        let a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        let actual = _mm512_mask_madd52lo_epu64(a, K, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm512_set1_epi64(100055558127628);
+        expected = _mm512_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m512i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_maskz_madd52lo_epu64() {
+        let a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm512_set1_epi64(100055558127628);
+        expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);
+
+        assert_eq_m512i(expected, actual);
     }
 
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm256_madd52hi_epu64() {
-        let mut a = _mm256_set1_epi64x(10 << 40);
+        let a = _mm256_set1_epi64x(10 << 40);
         let b = _mm256_set1_epi64x((11 << 40) + 4);
         let c = _mm256_set1_epi64x((12 << 40) + 3);
 
-        a = _mm256_madd52hi_epu64(a, b, c);
+        let actual = _mm256_madd52hi_epu64(a, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
         let expected = _mm256_set1_epi64x(11030549757952);
 
-        assert_eq_m256i(a, expected);
+        assert_eq_m256i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_mask_madd52hi_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_mask_madd52hi_epu64(a, K, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let mut expected = _mm256_set1_epi64x(11030549757952);
+        expected = _mm256_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m256i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_maskz_madd52hi_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let mut expected = _mm256_set1_epi64x(11030549757952);
+        expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);
+
+        assert_eq_m256i(expected, actual);
     }
 
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm256_madd52lo_epu64() {
-        let mut a = _mm256_set1_epi64x(10 << 40);
+        let a = _mm256_set1_epi64x(10 << 40);
         let b = _mm256_set1_epi64x((11 << 40) + 4);
         let c = _mm256_set1_epi64x((12 << 40) + 3);
 
-        a = _mm256_madd52lo_epu64(a, b, c);
+        let actual = _mm256_madd52lo_epu64(a, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
         let expected = _mm256_set1_epi64x(100055558127628);
 
-        assert_eq_m256i(a, expected);
+        assert_eq_m256i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_mask_madd52lo_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_mask_madd52lo_epu64(a, K, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm256_set1_epi64x(100055558127628);
+        expected = _mm256_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m256i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_maskz_madd52lo_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm256_set1_epi64x(100055558127628);
+        expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);
+
+        assert_eq_m256i(expected, actual);
     }
 
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm_madd52hi_epu64() {
-        let mut a = _mm_set1_epi64x(10 << 40);
+        let a = _mm_set1_epi64x(10 << 40);
         let b = _mm_set1_epi64x((11 << 40) + 4);
         let c = _mm_set1_epi64x((12 << 40) + 3);
 
-        a = _mm_madd52hi_epu64(a, b, c);
+        let actual = _mm_madd52hi_epu64(a, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
         let expected = _mm_set1_epi64x(11030549757952);
 
-        assert_eq_m128i(a, expected);
+        assert_eq_m128i(expected, actual);
     }
 
     #[simd_test(enable = "avx512ifma,avx512vl")]
-    unsafe fn test_mm_madd52lo_epu64() {
-        let mut a = _mm_set1_epi64x(10 << 40);
+    unsafe fn test_mm_mask_madd52hi_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
         let b = _mm_set1_epi64x((11 << 40) + 4);
         let c = _mm_set1_epi64x((12 << 40) + 3);
 
-        a = _mm_madd52hi_epu64(a, b, c);
+        let actual = _mm_mask_madd52hi_epu64(a, K, b, c);
 
         // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
-        let expected = _mm_set1_epi64x(11030549757952);
+        let mut expected = _mm_set1_epi64x(11030549757952);
+        expected = _mm_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m128i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_maskz_madd52hi_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_maskz_madd52hi_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let mut expected = _mm_set1_epi64x(11030549757952);
+        expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);
+
+        assert_eq_m128i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_madd52lo_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm_set1_epi64x(100055558127628);
+
+        assert_eq_m128i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_mask_madd52lo_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_mask_madd52lo_epu64(a, K, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm_set1_epi64x(100055558127628);
+        expected = _mm_mask_blend_epi64(K, a, expected);
+
+        assert_eq_m128i(expected, actual);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_maskz_madd52lo_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_maskz_madd52lo_epu64(K, a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let mut expected = _mm_set1_epi64x(100055558127628);
+        expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);
 
-        assert_eq_m128i(a, expected);
+        assert_eq_m128i(expected, actual);
     }
 }
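With these definitions in place, the masked variants become callable on nightly Rust, since the intrinsics remain gated behind the unstable `stdarch_x86_avx512` feature (tracking issue 111137). A hypothetical caller might look like the sketch below; `accumulate_low_lanes` is an illustrative name, not part of the commit:

```rust
// Sketch only: assumes a nightly toolchain with the unstable
// `stdarch_x86_avx512` feature enabled.
#![feature(stdarch_x86_avx512)]

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Computes acc + ((b * c) mod 2^52) in the four low 64-bit lanes and,
/// per the `maskz` semantics, zeroes the four lanes not selected by the mask.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512ifma")]
unsafe fn accumulate_low_lanes(acc: __m512i, b: __m512i, c: __m512i) -> __m512i {
    _mm512_maskz_madd52lo_epu64(0b0000_1111, acc, b, c)
}
```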
