diff options
| -rw-r--r-- | src/tools/miri/src/shims/x86/gfni.rs | 196 | ||||
| -rw-r--r-- | src/tools/miri/src/shims/x86/mod.rs | 8 | ||||
| -rw-r--r-- | src/tools/miri/tests/pass/shims/x86/intrinsics-x86-gfni.rs | 518 |
3 files changed, 722 insertions, 0 deletions
diff --git a/src/tools/miri/src/shims/x86/gfni.rs b/src/tools/miri/src/shims/x86/gfni.rs new file mode 100644 index 00000000000..c91b8c835f2 --- /dev/null +++ b/src/tools/miri/src/shims/x86/gfni.rs @@ -0,0 +1,196 @@ +use rustc_span::Symbol; +use rustc_target::spec::abi::Abi; + +use crate::*; + +impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {} +pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { + fn emulate_x86_gfni_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx>], + dest: &MPlaceTy<'tcx>, + ) -> InterpResult<'tcx, EmulateItemResult> { + let this = self.eval_context_mut(); + + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.").unwrap(); + + this.expect_target_feature_for_intrinsic(link_name, "gfni")?; + if unprefixed_name.ends_with(".256") { + this.expect_target_feature_for_intrinsic(link_name, "avx")?; + } else if unprefixed_name.ends_with(".512") { + this.expect_target_feature_for_intrinsic(link_name, "avx512f")?; + } + + match unprefixed_name { + // Used to implement the `_mm{, 256, 512}_gf2p8affine_epi64_epi8` functions. + // See `affine_transform` for details. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=gf2p8affine_ + "vgf2p8affineqb.128" | "vgf2p8affineqb.256" | "vgf2p8affineqb.512" => { + let [left, right, imm8] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + affine_transform(this, left, right, imm8, dest, /* inverse */ false)?; + } + // Used to implement the `_mm{, 256, 512}_gf2p8affineinv_epi64_epi8` functions. + // See `affine_transform` for details. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=gf2p8affineinv + "vgf2p8affineinvqb.128" | "vgf2p8affineinvqb.256" | "vgf2p8affineinvqb.512" => { + let [left, right, imm8] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + affine_transform(this, left, right, imm8, dest, /* inverse */ true)?; + } + // Used to implement the `_mm{, 256, 512}_gf2p8mul_epi8` functions. + // Multiplies packed 8-bit integers in `left` and `right` in the finite field GF(2^8) + // and store the results in `dst`. The field GF(2^8) is represented in + // polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=gf2p8mul + "vgf2p8mulb.128" | "vgf2p8mulb.256" | "vgf2p8mulb.512" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.project_to_simd(left)?; + let (right, right_len) = this.project_to_simd(right)?; + let (dest, dest_len) = this.project_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u8()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?; + let dest = this.project_index(&dest, i)?; + this.write_scalar(Scalar::from_u8(gf2p8_mul(left, right)), &dest)?; + } + } + _ => return interp_ok(EmulateItemResult::NotSupported), + } + interp_ok(EmulateItemResult::NeedsReturn) + } +} + +/// Calculates the affine transformation `right * left + imm8` inside the finite field GF(2^8). +/// `right` is an 8x8 bit matrix, `left` and `imm8` are bit vectors. +/// If `inverse` is set, then the inverse transformation with respect to the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1 is performed instead. +fn affine_transform<'tcx>( + this: &mut MiriInterpCx<'tcx>, + left: &OpTy<'tcx>, + right: &OpTy<'tcx>, + imm8: &OpTy<'tcx>, + dest: &MPlaceTy<'tcx>, + inverse: bool, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.project_to_simd(left)?; + let (right, right_len) = this.project_to_simd(right)?; + let (dest, dest_len) = this.project_to_simd(dest)?; + + assert_eq!(dest_len, right_len); + assert_eq!(dest_len, left_len); + + let imm8 = this.read_scalar(imm8)?.to_u8()?; + + // Each 8x8 bit matrix gets multiplied with eight bit vectors. + // Therefore, the iteration is done in chunks of eight. + for i in (0..dest_len).step_by(8) { + // Get the bit matrix. + let mut matrix = [0u8; 8]; + for j in 0..8 { + matrix[usize::try_from(j).unwrap()] = + this.read_scalar(&this.project_index(&right, i.wrapping_add(j))?)?.to_u8()?; + } + + // Multiply the matrix with the vector and perform the addition. + for j in 0..8 { + let index = i.wrapping_add(j); + let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u8()?; + let left = if inverse { TABLE[usize::from(left)] } else { left }; + + let mut res = 0; + + // Do the matrix multiplication. + for bit in 0u8..8 { + let mut b = matrix[usize::from(bit)] & left; + + // Calculate the parity bit. + b = (b & 0b1111) ^ (b >> 4); + b = (b & 0b11) ^ (b >> 2); + b = (b & 0b1) ^ (b >> 1); + + res |= b << 7u8.wrapping_sub(bit); + } + + // Perform the addition. + res ^= imm8; + + let dest = this.project_index(&dest, index)?; + this.write_scalar(Scalar::from_u8(res), &dest)?; + } + } + + interp_ok(()) +} + +/// A lookup table for computing the inverse byte for the inverse affine transformation. +// This is a evaluated at compile time. Trait based conversion is not available. +/// See <https://www.corsix.org/content/galois-field-instructions-2021-cpus> for the +/// definition of `gf_inv` which was used for the creation of this table. +#[allow(clippy::cast_possible_truncation)] +static TABLE: [u8; 256] = { + let mut array = [0; 256]; + + let mut i = 1; + while i < 256 { + let mut x = i as u8; + let mut y = gf2p8_mul(x, x); + x = y; + let mut j = 2; + while j < 8 { + x = gf2p8_mul(x, x); + y = gf2p8_mul(x, y); + j += 1; + } + array[i] = y; + i += 1; + } + + array +}; + +/// Multiplies packed 8-bit integers in `left` and `right` in the finite field GF(2^8) +/// and store the results in `dst`. The field GF(2^8) is represented in +/// polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. +/// See <https://www.corsix.org/content/galois-field-instructions-2021-cpus> for details. +// This is a const function. Trait based conversion is not available. +#[allow(clippy::cast_possible_truncation)] +const fn gf2p8_mul(left: u8, right: u8) -> u8 { + // This implementation is based on the `gf2p8mul_byte` definition found inside the Intel intrinsics guide. + // See https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=gf2p8mul + // for more information. + + const POLYNOMIAL: u32 = 0x11b; + + let left = left as u32; + let right = right as u32; + + let mut result = 0u32; + + let mut i = 0u32; + while i < 8 { + if left & (1 << i) != 0 { + result ^= right << i; + } + i = i.wrapping_add(1); + } + + let mut i = 14u32; + while i >= 8 { + if result & (1 << i) != 0 { + result ^= POLYNOMIAL << i.wrapping_sub(8); + } + i = i.wrapping_sub(1); + } + + result as u8 +} diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs index 19c678cb7fa..9339d301aee 100644 --- a/src/tools/miri/src/shims/x86/mod.rs +++ b/src/tools/miri/src/shims/x86/mod.rs @@ -15,6 +15,7 @@ mod aesni; mod avx; mod avx2; mod bmi; +mod gfni; mod sha; mod sse; mod sse2; @@ -106,6 +107,13 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { this, link_name, abi, args, dest, ); } + // The GFNI extension does not get its own namespace. + // Check for instruction names instead. + name if name.starts_with("vgf2p8affine") || name.starts_with("vgf2p8mulb") => { + return gfni::EvalContextExt::emulate_x86_gfni_intrinsic( + this, link_name, abi, args, dest, + ); + } name if name.starts_with("sha") => { return sha::EvalContextExt::emulate_x86_sha_intrinsic( this, link_name, abi, args, dest, diff --git a/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-gfni.rs b/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-gfni.rs new file mode 100644 index 00000000000..a629e2acfe9 --- /dev/null +++ b/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-gfni.rs @@ -0,0 +1,518 @@ +// We're testing x86 target specific features +//@only-target: x86_64 i686 +//@compile-flags: -C target-feature=+gfni,+avx512f + +// The constants in the tests below are just bit patterns. They should not +// be interpreted as integers; signedness does not make sense for them, but +// __mXXXi happens to be defined in terms of signed integers. +#![allow(overflowing_literals)] +#![feature(avx512_target_feature)] +#![feature(stdarch_x86_avx512)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::hint::black_box; +use std::mem::{size_of, transmute}; + +const IDENTITY_BYTE: i32 = 0; +const CONSTANT_BYTE: i32 = 0x63; + +fn main() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/gfni.rs + + assert!(is_x86_feature_detected!("avx512f")); + assert!(is_x86_feature_detected!("gfni")); + + unsafe { + let byte_mul_test_data = generate_byte_mul_test_data(); + let affine_mul_test_data_identity = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + let affine_mul_test_data_constant = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + let inv_tests_data = generate_inv_tests_data(); + + test_mm512_gf2p8mul_epi8(&byte_mul_test_data); + test_mm256_gf2p8mul_epi8(&byte_mul_test_data); + test_mm_gf2p8mul_epi8(&byte_mul_test_data); + test_mm512_gf2p8affine_epi64_epi8(&byte_mul_test_data, &affine_mul_test_data_identity); + test_mm256_gf2p8affine_epi64_epi8(&byte_mul_test_data, &affine_mul_test_data_identity); + test_mm_gf2p8affine_epi64_epi8(&byte_mul_test_data, &affine_mul_test_data_identity); + test_mm512_gf2p8affineinv_epi64_epi8(&inv_tests_data, &affine_mul_test_data_constant); + test_mm256_gf2p8affineinv_epi64_epi8(&inv_tests_data, &affine_mul_test_data_constant); + test_mm_gf2p8affineinv_epi64_epi8(&inv_tests_data, &affine_mul_test_data_constant); + } +} + +#[target_feature(enable = "gfni,avx512f")] +unsafe fn test_mm512_gf2p8mul_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), +) { + let (left, right, expected) = byte_mul_test_data; + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(left, i); + let right = load_m512i_word(right, i); + let expected = load_m512i_word(expected, i); + let result = _mm512_gf2p8mul_epi8(left, right); + assert_eq_m512i(result, expected); + } +} + +#[target_feature(enable = "gfni,avx")] +unsafe fn test_mm256_gf2p8mul_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), +) { + let (left, right, expected) = byte_mul_test_data; + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(left, i); + let right = load_m256i_word(right, i); + let expected = load_m256i_word(expected, i); + let result = _mm256_gf2p8mul_epi8(left, right); + assert_eq_m256i(result, expected); + } +} + +#[target_feature(enable = "gfni")] +unsafe fn test_mm_gf2p8mul_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), +) { + let (left, right, expected) = byte_mul_test_data; + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(left, i); + let right = load_m128i_word(right, i); + let expected = load_m128i_word(expected, i); + let result = _mm_gf2p8mul_epi8(left, right); + assert_eq_m128i(result, expected); + } +} + +#[target_feature(enable = "gfni,avx512f")] +unsafe fn test_mm512_gf2p8affine_epi64_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), + affine_mul_test_data_identity: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let constant: i64 = 0; + let identity = _mm512_set1_epi64(identity); + let constant = _mm512_set1_epi64(constant); + let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = byte_mul_test_data; + let (matrices, vectors, references) = affine_mul_test_data_identity; + + for i in 0..NUM_TEST_WORDS_512 { + let data = load_m512i_word(bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m512i(result, constant_reference); + let data = load_m512i_word(more_bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m512i(result, constant_reference); + + let matrix = load_m512i_word(matrices, i); + let vector = load_m512i_word(vectors, i); + let reference = load_m512i_word(references, i); + + let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); + assert_eq_m512i(result, reference); + } +} + +#[target_feature(enable = "gfni,avx")] +unsafe fn test_mm256_gf2p8affine_epi64_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), + affine_mul_test_data_identity: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let constant: i64 = 0; + let identity = _mm256_set1_epi64x(identity); + let constant = _mm256_set1_epi64x(constant); + let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = byte_mul_test_data; + let (matrices, vectors, references) = affine_mul_test_data_identity; + + for i in 0..NUM_TEST_WORDS_256 { + let data = load_m256i_word(bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m256i(result, constant_reference); + let data = load_m256i_word(more_bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m256i(result, constant_reference); + + let matrix = load_m256i_word(matrices, i); + let vector = load_m256i_word(vectors, i); + let reference = load_m256i_word(references, i); + + let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); + assert_eq_m256i(result, reference); + } +} + +#[target_feature(enable = "gfni")] +unsafe fn test_mm_gf2p8affine_epi64_epi8( + byte_mul_test_data: &([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]), + affine_mul_test_data_identity: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let constant: i64 = 0; + let identity = _mm_set1_epi64x(identity); + let constant = _mm_set1_epi64x(constant); + let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = byte_mul_test_data; + let (matrices, vectors, references) = affine_mul_test_data_identity; + + for i in 0..NUM_TEST_WORDS_128 { + let data = load_m128i_word(bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m128i(result, constant_reference); + let data = load_m128i_word(more_bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); + assert_eq_m128i(result, constant_reference); + + let matrix = load_m128i_word(matrices, i); + let vector = load_m128i_word(vectors, i); + let reference = load_m128i_word(references, i); + + let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); + assert_eq_m128i(result, reference); + } +} + +#[target_feature(enable = "gfni,avx512f")] +unsafe fn test_mm512_gf2p8affineinv_epi64_epi8( + inv_tests_data: &([u8; NUM_BYTES], [u8; NUM_BYTES]), + affine_mul_test_data_constant: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let identity = _mm512_set1_epi64(identity); + + // validate inversion + let (inputs, results) = inv_tests_data; + + for i in 0..NUM_BYTES_WORDS_512 { + let input = load_m512i_word(inputs, i); + let reference = load_m512i_word(results, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); + let remultiplied = _mm512_gf2p8mul_epi8(result, input); + assert_eq_m512i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = affine_mul_test_data_constant; + + for i in 0..NUM_TEST_WORDS_512 { + let vector = load_m512i_word(vectors, i); + let matrix = load_m512i_word(matrices, i); + + let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); + let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); + let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); + assert_eq_m512i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_512 { + let reference = load_m512i_word(&AES_S_BOX, i); + let input = load_m512i_word(inputs, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); + assert_eq_m512i(result, reference); + } +} + +#[target_feature(enable = "gfni,avx")] +unsafe fn test_mm256_gf2p8affineinv_epi64_epi8( + inv_tests_data: &([u8; NUM_BYTES], [u8; NUM_BYTES]), + affine_mul_test_data_constant: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let identity = _mm256_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = inv_tests_data; + + for i in 0..NUM_BYTES_WORDS_256 { + let input = load_m256i_word(inputs, i); + let reference = load_m256i_word(results, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); + let remultiplied = _mm256_gf2p8mul_epi8(result, input); + assert_eq_m256i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = affine_mul_test_data_constant; + + for i in 0..NUM_TEST_WORDS_256 { + let vector = load_m256i_word(vectors, i); + let matrix = load_m256i_word(matrices, i); + + let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); + let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); + let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); + assert_eq_m256i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_256 { + let reference = load_m256i_word(&AES_S_BOX, i); + let input = load_m256i_word(inputs, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); + assert_eq_m256i(result, reference); + } +} + +#[target_feature(enable = "gfni")] +unsafe fn test_mm_gf2p8affineinv_epi64_epi8( + inv_tests_data: &([u8; NUM_BYTES], [u8; NUM_BYTES]), + affine_mul_test_data_constant: &( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ), +) { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + let identity = _mm_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = inv_tests_data; + + for i in 0..NUM_BYTES_WORDS_128 { + let input = load_m128i_word(inputs, i); + let reference = load_m128i_word(results, i); + let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); + let remultiplied = _mm_gf2p8mul_epi8(result, input); + assert_eq_m128i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = affine_mul_test_data_constant; + + for i in 0..NUM_TEST_WORDS_128 { + let vector = load_m128i_word(vectors, i); + let matrix = load_m128i_word(matrices, i); + + let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); + let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); + let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); + assert_eq_m128i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_128 { + let reference = load_m128i_word(&AES_S_BOX, i); + let input = load_m128i_word(inputs, i); + let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); + assert_eq_m128i(result, reference); + } +} + +/* Various utilities for processing SIMD values. */ + +#[target_feature(enable = "sse2")] +unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i { + let byte_offset = word_index * 16 / size_of::<T>(); + let pointer = data.as_ptr().add(byte_offset) as *const __m128i; + _mm_loadu_si128(black_box(pointer)) +} + +#[target_feature(enable = "avx")] +unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i { + let byte_offset = word_index * 32 / size_of::<T>(); + let pointer = data.as_ptr().add(byte_offset) as *const __m256i; + _mm256_loadu_si256(black_box(pointer)) +} + +#[target_feature(enable = "avx512f")] +unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i { + let byte_offset = word_index * 64 / size_of::<T>(); + let pointer = data.as_ptr().add(byte_offset) as *const i32; + _mm512_loadu_si512(black_box(pointer)) +} + +#[track_caller] +#[target_feature(enable = "sse2")] +unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx")] +unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx512f")] +unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [u64; 8]>(a), transmute::<_, [u64; 8]>(b)) +} + +/* Software implementation of the hardware intrinsics. */ + +fn mulbyte(left: u8, right: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8 + const REDUCTION_POLYNOMIAL: u16 = 0x11b; + let left: u16 = left.into(); + let right: u16 = right.into(); + let mut carryless_product: u16 = 0; + + // Carryless multiplication + for i in 0..8 { + if ((left >> i) & 0x01) != 0 { + carryless_product ^= right << i; + } + } + + // reduction, adding in "0" where appropriate to clear out high bits + // note that REDUCTION_POLYNOMIAL is zero in this context + for i in (8..=14).rev() { + if ((carryless_product >> i) & 0x01) != 0 { + carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); + } + } + + carryless_product as u8 +} + +/// Calculates the bitwise XOR of all bits inside a byte. +fn parity(input: u8) -> u8 { + let mut accumulator = 0; + for i in 0..8 { + accumulator ^= (input >> i) & 0x01; + } + accumulator +} + +/// Calculates `matrix * x + b` inside the finite field GF(2). +fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8 + let mut accumulator = 0; + + for bit in 0..8 { + accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); + } + + accumulator ^ b +} + +/* Test data generation. */ + +const NUM_TEST_WORDS_512: usize = 4; +const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2; +const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2; +const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64; +const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2; +const NUM_BYTES: usize = 256; +const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16; +const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2; +const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2; + +fn generate_affine_mul_test_data( + immediate: u8, +) -> ([u64; NUM_TEST_WORDS_64], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]) { + let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_WORDS_64 { + left[i] = (i as u64) * 103 * 101; + for j in 0..8 { + let j64 = j as u64; + right[i * 8 + j] = ((left[i] + j64) % 256) as u8; + result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate); + } + } + + (left, right, result) +} + +fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { + let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; + let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; + + for i in 0..NUM_BYTES { + input[i] = (i % 256) as u8; + result[i] = if i == 0 { 0 } else { 1 }; + } + + (input, result) +} + +const AES_S_BOX: [u8; NUM_BYTES] = [ + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +]; + +fn generate_byte_mul_test_data() +-> ([u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES], [u8; NUM_TEST_ENTRIES]) { + let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_ENTRIES { + left[i] = (i % 256) as u8; + right[i] = left[i].wrapping_mul(101); + result[i] = mulbyte(left[i], right[i]); + } + + (left, right, result) +} |
