about summary refs log tree commit diff
path: root/library/stdarch/crates
diff options
context:
space:
mode:
author gnzlbg <gonzalobg88@gmail.com> 2019-01-21 16:59:10 +0100
committer gnzlbg <gnzlbg@users.noreply.github.com> 2019-01-22 17:04:25 +0100
commit 11c624e488663f4f7554d1f92a072c7caee3908e (patch)
tree 0523ed5fac723731d10ef7110a0c36923cb545a2 /library/stdarch/crates
parent 3ca14c6fecdf36ad5c5beca759d2ed7f6a0e1e5f (diff)
download rust-11c624e488663f4f7554d1f92a072c7caee3908e.tar.gz
rust-11c624e488663f4f7554d1f92a072c7caee3908e.zip
Refactor stdsimd
This commit:

* renames `coresimd` to `core_arch` and `stdsimd` to `std_detect`

* `std_detect` (formerly `stdsimd`) no longer depends on `core_arch` - it is a
  freestanding `no_std` library that only depends on `core`

* moves the top-level coresimd and stdsimd directories into the appropriate
  crates/... directories - this simplifies creating crates.io releases of these crates

* moves the top-level `coresimd` and `stdsimd` sub-directories into their
  corresponding crates in `crates/{core_arch, std_detect}`.
Diffstat (limited to 'library/stdarch/crates')
-rw-r--r--library/stdarch/crates/core_arch/Cargo.toml (renamed from library/stdarch/crates/coresimd/Cargo.toml)8
-rw-r--r--library/stdarch/crates/core_arch/build.rs (renamed from library/stdarch/crates/coresimd/build.rs)0
-rwxr-xr-xlibrary/stdarch/crates/core_arch/foo.wasm (renamed from library/stdarch/crates/coresimd/foo.wasm)bin252 -> 252 bytes
-rw-r--r--library/stdarch/crates/core_arch/src/aarch64/crc.rs143
-rw-r--r--library/stdarch/crates/core_arch/src/aarch64/crypto.rs333
-rw-r--r--library/stdarch/crates/core_arch/src/aarch64/mod.rs29
-rw-r--r--library/stdarch/crates/core_arch/src/aarch64/neon.rs2022
-rw-r--r--library/stdarch/crates/core_arch/src/aarch64/v8.rs105
-rw-r--r--library/stdarch/crates/core_arch/src/arm/armclang.rs68
-rw-r--r--library/stdarch/crates/core_arch/src/arm/cmsis.rs330
-rw-r--r--library/stdarch/crates/core_arch/src/arm/dsp.rs654
-rw-r--r--library/stdarch/crates/core_arch/src/arm/mod.rs56
-rw-r--r--library/stdarch/crates/core_arch/src/arm/neon.rs1420
-rw-r--r--library/stdarch/crates/core_arch/src/arm/table_lookup_tests.rs1042
-rw-r--r--library/stdarch/crates/core_arch/src/arm/v6.rs49
-rw-r--r--library/stdarch/crates/core_arch/src/arm/v7.rs89
-rw-r--r--library/stdarch/crates/core_arch/src/lib.rs (renamed from library/stdarch/crates/coresimd/src/lib.rs)16
-rw-r--r--library/stdarch/crates/core_arch/src/macros.rs282
-rw-r--r--library/stdarch/crates/core_arch/src/mips/mod.rs14
-rw-r--r--library/stdarch/crates/core_arch/src/mips/msa.rs62
-rw-r--r--library/stdarch/crates/core_arch/src/mod.rs506
-rw-r--r--library/stdarch/crates/core_arch/src/nvptx/mod.rs126
-rw-r--r--library/stdarch/crates/core_arch/src/powerpc/altivec.rs1488
-rw-r--r--library/stdarch/crates/core_arch/src/powerpc/mod.rs19
-rw-r--r--library/stdarch/crates/core_arch/src/powerpc/vsx.rs116
-rw-r--r--library/stdarch/crates/core_arch/src/powerpc64/mod.rs8
-rw-r--r--library/stdarch/crates/core_arch/src/simd.rs191
-rw-r--r--library/stdarch/crates/core_arch/src/simd_llvm.rs67
-rw-r--r--library/stdarch/crates/core_arch/src/v64.rs85
-rw-r--r--library/stdarch/crates/core_arch/src/wasm32/atomic.rs120
-rw-r--r--library/stdarch/crates/core_arch/src/wasm32/memory.rs64
-rw-r--r--library/stdarch/crates/core_arch/src/wasm32/mod.rs26
-rw-r--r--library/stdarch/crates/core_arch/src/wasm32/simd128.rs2146
-rw-r--r--library/stdarch/crates/core_arch/src/x86/abm.rs62
-rw-r--r--library/stdarch/crates/core_arch/src/x86/adx.rs46
-rw-r--r--library/stdarch/crates/core_arch/src/x86/aes.rs175
-rw-r--r--library/stdarch/crates/core_arch/src/x86/avx.rs5060
-rw-r--r--library/stdarch/crates/core_arch/src/x86/avx2.rs6230
-rw-r--r--library/stdarch/crates/core_arch/src/x86/avx512f.rs193
-rw-r--r--library/stdarch/crates/core_arch/src/x86/bmi1.rs178
-rw-r--r--library/stdarch/crates/core_arch/src/x86/bmi2.rs133
-rw-r--r--library/stdarch/crates/core_arch/src/x86/bswap.rs35
-rw-r--r--library/stdarch/crates/core_arch/src/x86/cpuid.rs187
-rw-r--r--library/stdarch/crates/core_arch/src/x86/eflags.rs83
-rw-r--r--library/stdarch/crates/core_arch/src/x86/fma.rs802
-rw-r--r--library/stdarch/crates/core_arch/src/x86/fxsr.rs112
-rw-r--r--library/stdarch/crates/core_arch/src/x86/macros.rs109
-rw-r--r--library/stdarch/crates/core_arch/src/x86/mmx.rs794
-rw-r--r--library/stdarch/crates/core_arch/src/x86/mod.rs617
-rw-r--r--library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs74
-rw-r--r--library/stdarch/crates/core_arch/src/x86/rdrand.rs76
-rw-r--r--library/stdarch/crates/core_arch/src/x86/rdtsc.rs77
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sha.rs224
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse.rs4161
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse2.rs5253
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse3.rs255
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse41.rs1938
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse42.rs941
-rw-r--r--library/stdarch/crates/core_arch/src/x86/sse4a.rs159
-rw-r--r--library/stdarch/crates/core_arch/src/x86/ssse3.rs898
-rw-r--r--library/stdarch/crates/core_arch/src/x86/tbm.rs460
-rw-r--r--library/stdarch/crates/core_arch/src/x86/test.rs145
-rw-r--r--library/stdarch/crates/core_arch/src/x86/xsave.rs285
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/abm.rs62
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/adx.rs46
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/avx.rs46
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/avx2.rs49
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/bmi.rs184
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/bmi2.rs139
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/bswap.rs35
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs75
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/fxsr.rs112
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/mod.rs46
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/rdrand.rs43
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/sse.rs152
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/sse2.rs210
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/sse41.rs59
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/sse42.rs37
-rw-r--r--library/stdarch/crates/core_arch/src/x86_64/xsave.rs227
-rw-r--r--library/stdarch/crates/core_arch/tests/cpu-detection.rs (renamed from library/stdarch/crates/coresimd/tests/cpu-detection.rs)2
-rw-r--r--library/stdarch/crates/std_detect/Cargo.toml (renamed from library/stdarch/crates/stdsimd/Cargo.toml)25
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs103
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/arm.rs36
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/mips.rs26
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/mips64.rs26
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs39
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs39
-rw-r--r--library/stdarch/crates/std_detect/src/detect/arch/x86.rs331
-rw-r--r--library/stdarch/crates/std_detect/src/detect/bit.rs9
-rw-r--r--library/stdarch/crates/std_detect/src/detect/cache.rs162
-rw-r--r--library/stdarch/crates/std_detect/src/detect/error_macros.rs150
-rw-r--r--library/stdarch/crates/std_detect/src/detect/mod.rs85
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/aarch64.rs79
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs28
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs14
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs157
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs49
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs270
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs301
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs31
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs26
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs41
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/other.rs9
-rw-r--r--library/stdarch/crates/std_detect/src/detect/os/x86.rs357
-rw-r--r--library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxvbin0 -> 160 bytes
-rw-r--r--library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxvbin0 -> 304 bytes
-rw-r--r--library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxvbin0 -> 160 bytes
-rw-r--r--library/stdarch/crates/std_detect/src/lib.rs37
-rw-r--r--library/stdarch/crates/std_detect/src/mod.rs5
-rw-r--r--library/stdarch/crates/std_detect/tests/cpu-detection.rs (renamed from library/stdarch/crates/stdsimd/tests/cpu-detection.rs)2
-rw-r--r--library/stdarch/crates/stdsimd-verify/build.rs10
-rw-r--r--library/stdarch/crates/stdsimd-verify/src/lib.rs4
-rw-r--r--library/stdarch/crates/stdsimd/src/lib.rs41
113 files changed, 45377 insertions, 85 deletions
diff --git a/library/stdarch/crates/coresimd/Cargo.toml b/library/stdarch/crates/core_arch/Cargo.toml
index d96672baf29..09bf17bc611 100644
--- a/library/stdarch/crates/coresimd/Cargo.toml
+++ b/library/stdarch/crates/core_arch/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "coresimd"
+name = "core_arch"
 version = "0.1.3"
 authors = [
     "Alex Crichton <alex@alexcrichton.com>",
@@ -7,11 +7,11 @@ authors = [
     "Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>",
 ]
 description = "SIMD support in Rust's core library."
-documentation = "https://docs.rs/stdsimd"
+documentation = "https://docs.rs/core_arch"
 homepage = "https://github.com/rust-lang-nursery/stdsimd"
 repository = "https://github.com/rust-lang-nursery/stdsimd"
 readme = "README.md"
-keywords = ["core", "simd", "intrinsics"]
+keywords = ["core", "simd", "arch", "intrinsics"]
 categories = ["hardware-support", "no-std"]
 license = "MIT/Apache-2.0"
 
@@ -24,7 +24,7 @@ maintenance = { status = "experimental" }
 
 [dev-dependencies]
 stdsimd-test = { version = "0.*", path = "../stdsimd-test" }
-stdsimd = { version = "0.1.3", path = "../stdsimd" }
+std_detect = { version = "0.1.3", path = "../std_detect" }
 
 [target.wasm32-unknown-unknown.dev-dependencies]
 wasm-bindgen-test = "=0.2.19"
diff --git a/library/stdarch/crates/coresimd/build.rs b/library/stdarch/crates/core_arch/build.rs
index f497e1830ee..f497e1830ee 100644
--- a/library/stdarch/crates/coresimd/build.rs
+++ b/library/stdarch/crates/core_arch/build.rs
diff --git a/library/stdarch/crates/coresimd/foo.wasm b/library/stdarch/crates/core_arch/foo.wasm
index 34e11336649..34e11336649 100755
--- a/library/stdarch/crates/coresimd/foo.wasm
+++ b/library/stdarch/crates/core_arch/foo.wasm
Binary files differ
diff --git a/library/stdarch/crates/core_arch/src/aarch64/crc.rs b/library/stdarch/crates/core_arch/src/aarch64/crc.rs
new file mode 100644
index 00000000000..f43b163678a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/crc.rs
@@ -0,0 +1,143 @@
+extern "C" {
+    #[link_name = "llvm.aarch64.crc32b"]
+    fn crc32b_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32h"]
+    fn crc32h_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32w"]
+    fn crc32w_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32x"]
+    fn crc32x_(crc: u32, data: u64) -> u32;
+
+    #[link_name = "llvm.aarch64.crc32cb"]
+    fn crc32cb_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32ch"]
+    fn crc32ch_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32cw"]
+    fn crc32cw_(crc: u32, data: u32) -> u32;
+    #[link_name = "llvm.aarch64.crc32cx"]
+    fn crc32cx_(crc: u32, data: u64) -> u32;
+}
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// CRC32 single round checksum for bytes (8 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32b))]
+pub unsafe fn __crc32b(crc: u32, data: u8) -> u32 {
+    crc32b_(crc, data as u32)
+}
+
+/// CRC32 single round checksum for half words (16 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32h))]
+pub unsafe fn __crc32h(crc: u32, data: u16) -> u32 {
+    crc32h_(crc, data as u32)
+}
+
+/// CRC32 single round checksum for words (32 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32w))]
+pub unsafe fn __crc32w(crc: u32, data: u32) -> u32 {
+    crc32w_(crc, data)
+}
+
+/// CRC32 single round checksum for quad words (64 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32x))]
+pub unsafe fn __crc32d(crc: u32, data: u64) -> u32 {
+    crc32x_(crc, data)
+}
+
+/// CRC32-C single round checksum for bytes (8 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32cb))]
+pub unsafe fn __crc32cb(crc: u32, data: u8) -> u32 {
+    crc32cb_(crc, data as u32)
+}
+
+/// CRC32-C single round checksum for half words (16 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32ch))]
+pub unsafe fn __crc32ch(crc: u32, data: u16) -> u32 {
+    crc32ch_(crc, data as u32)
+}
+
+/// CRC32-C single round checksum for words (32 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32cw))]
+pub unsafe fn __crc32cw(crc: u32, data: u32) -> u32 {
+    crc32cw_(crc, data)
+}
+
+/// CRC32-C single round checksum for quad words (64 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32cx))]
+pub unsafe fn __crc32cd(crc: u32, data: u64) -> u32 {
+    crc32cx_(crc, data)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::aarch64::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32b() {
+        assert_eq!(__crc32b(0, 0), 0);
+        assert_eq!(__crc32b(0, 255), 755167117);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32h() {
+        assert_eq!(__crc32h(0, 0), 0);
+        assert_eq!(__crc32h(0, 16384), 1994146192);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32w() {
+        assert_eq!(__crc32w(0, 0), 0);
+        assert_eq!(__crc32w(0, 4294967295), 3736805603);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32d() {
+        assert_eq!(__crc32d(0, 0), 0);
+        assert_eq!(__crc32d(0, 18446744073709551615), 1147535477);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32cb() {
+        assert_eq!(__crc32cb(0, 0), 0);
+        assert_eq!(__crc32cb(0, 255), 2910671697);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32ch() {
+        assert_eq!(__crc32ch(0, 0), 0);
+        assert_eq!(__crc32ch(0, 16384), 1098587580);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32cw() {
+        assert_eq!(__crc32cw(0, 0), 0);
+        assert_eq!(__crc32cw(0, 4294967295), 3080238136);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32cd() {
+        assert_eq!(__crc32cd(0, 0), 0);
+        assert_eq!(__crc32cd(0, 18446744073709551615), 3293575501);
+    }
+
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/crypto.rs b/library/stdarch/crates/core_arch/src/aarch64/crypto.rs
new file mode 100644
index 00000000000..1676d37901a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/crypto.rs
@@ -0,0 +1,333 @@
+use core_arch::arm::uint32x4_t;
+use core_arch::arm::uint8x16_t;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.aarch64.crypto.aese"]
+    fn vaeseq_u8_(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t;
+    #[link_name = "llvm.aarch64.crypto.aesd"]
+    fn vaesdq_u8_(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t;
+    #[link_name = "llvm.aarch64.crypto.aesmc"]
+    fn vaesmcq_u8_(data: uint8x16_t) -> uint8x16_t;
+    #[link_name = "llvm.aarch64.crypto.aesimc"]
+    fn vaesimcq_u8_(data: uint8x16_t) -> uint8x16_t;
+
+    #[link_name = "llvm.aarch64.crypto.sha1h"]
+    fn vsha1h_u32_(hash_e: u32) -> u32;
+    #[link_name = "llvm.aarch64.crypto.sha1su0"]
+    fn vsha1su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha1su1"]
+    fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha1c"]
+    fn vsha1cq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha1p"]
+    fn vsha1pq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha1m"]
+    fn vsha1mq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+
+    #[link_name = "llvm.aarch64.crypto.sha256h"]
+    fn vsha256hq_u32_(hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha256h2"]
+    fn vsha256h2q_u32_(hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha256su0"]
+    fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.crypto.sha256su1"]
+    fn vsha256su1q_u32_(tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
+}
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// AES single round encryption.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(aese))]
+pub unsafe fn vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+    vaeseq_u8_(data, key)
+}
+
+/// AES single round decryption.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(aesd))]
+pub unsafe fn vaesdq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+    vaesdq_u8_(data, key)
+}
+
+/// AES mix columns.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(aesmc))]
+pub unsafe fn vaesmcq_u8(data: uint8x16_t) -> uint8x16_t {
+    vaesmcq_u8_(data)
+}
+
+/// AES inverse mix columns.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(aesimc))]
+pub unsafe fn vaesimcq_u8(data: uint8x16_t) -> uint8x16_t {
+    vaesimcq_u8_(data)
+}
+
+/// SHA1 fixed rotate.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1h))]
+pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 {
+    vsha1h_u32_(hash_e)
+}
+
+/// SHA1 hash update accelerator, choose.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1c))]
+pub unsafe fn vsha1cq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+    vsha1cq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 hash update accelerator, majority.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1m))]
+pub unsafe fn vsha1mq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+    vsha1mq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 hash update accelerator, parity.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1p))]
+pub unsafe fn vsha1pq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+    vsha1pq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 schedule update accelerator, first part.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1su0))]
+pub unsafe fn vsha1su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t {
+    vsha1su0q_u32_(w0_3, w4_7, w8_11)
+}
+
+/// SHA1 schedule update accelerator, second part.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha1su1))]
+pub unsafe fn vsha1su1q_u32(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t {
+    vsha1su1q_u32_(tw0_3, w12_15)
+}
+
+/// SHA256 hash update accelerator.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha256h))]
+pub unsafe fn vsha256hq_u32(
+    hash_abcd: uint32x4_t,
+    hash_efgh: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    vsha256hq_u32_(hash_abcd, hash_efgh, wk)
+}
+
+/// SHA256 hash update accelerator, upper part.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha256h2))]
+pub unsafe fn vsha256h2q_u32(
+    hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    vsha256h2q_u32_(hash_efgh, hash_abcd, wk)
+}
+
+/// SHA256 schedule update accelerator, first part.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha256su0))]
+pub unsafe fn vsha256su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+    vsha256su0q_u32_(w0_3, w4_7)
+}
+
+/// SHA256 schedule update accelerator, second part.
+#[inline]
+#[target_feature(enable = "crypto")]
+#[cfg_attr(test, assert_instr(sha256su1))]
+pub unsafe fn vsha256su1q_u32(
+    tw0_3: uint32x4_t,
+    w8_11: uint32x4_t,
+    w12_15: uint32x4_t,
+) -> uint32x4_t {
+    vsha256su1q_u32_(tw0_3, w8_11, w12_15)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::aarch64::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vaeseq_u8() {
+        let data = ::mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+        let key = ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+        let r: u8x16 = ::mem::transmute(vaeseq_u8(data, key));
+        assert_eq!(
+            r,
+            u8x16::new(
+                124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118, 124, 123, 124, 197
+            )
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vaesdq_u8() {
+        let data = ::mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+        let key = ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+        let r: u8x16 = ::mem::transmute(vaesdq_u8(data, key));
+        assert_eq!(
+            r,
+            u8x16::new(9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vaesmcq_u8() {
+        let data = ::mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+        let r: u8x16 = ::mem::transmute(vaesmcq_u8(data));
+        assert_eq!(
+            r,
+            u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vaesimcq_u8() {
+        let data = ::mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+        let r: u8x16 = ::mem::transmute(vaesimcq_u8(data));
+        assert_eq!(
+            r,
+            u8x16::new(43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80, 125, 70)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1h_u32() {
+        assert_eq!(vsha1h_u32(0x1234), 0x048d);
+        assert_eq!(vsha1h_u32(0x5678), 0x159e);
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1su0q_u32() {
+        let r: u32x4 = ::mem::transmute(vsha1su0q_u32(
+            ::mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+            ::mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+            ::mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+        ));
+        assert_eq!(r, u32x4::new(0x9abc, 0xdef0, 0x1234, 0x5678));
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1su1q_u32() {
+        let r: u32x4 = ::mem::transmute(vsha1su1q_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x00008898, 0x00019988, 0x00008898, 0x0000acd0)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1cq_u32() {
+        let r: u32x4 = ::mem::transmute(vsha1cq_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            0x1234,
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x8a32cbd8, 0x0c518a96, 0x0018a081, 0x0000c168)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1pq_u32() {
+        let r: u32x4 = ::mem::transmute(vsha1pq_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            0x1234,
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x469f0ba3, 0x0a326147, 0x80145d7f, 0x00009f47)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha1mq_u32() {
+        let r: u32x4 = ::mem::transmute(vsha1mq_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            0x1234,
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0xaa39693b, 0x0d51bf84, 0x001aa109, 0x0000d278)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha256hq_u32() {
+        let r: u32x4 = ::mem::transmute(vsha256hq_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x05e9aaa8, 0xec5f4c02, 0x20a1ea61, 0x28738cef)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha256h2q_u32() {
+        let r: u32x4 = ::mem::transmute(vsha256h2q_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x3745362e, 0x2fb51d00, 0xbd4c529b, 0x968b8516)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha256su0q_u32() {
+        let r: u32x4 = ::mem::transmute(vsha256su0q_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0xe59e1c97, 0x5eaf68da, 0xd7bcb51f, 0x6c8de152)
+        );
+    }
+
+    #[simd_test(enable = "crypto")]
+    unsafe fn test_vsha256su1q_u32() {
+        let r: u32x4 = ::mem::transmute(vsha256su1q_u32(
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+            ::mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+        ));
+        assert_eq!(
+            r,
+            u32x4::new(0x5e09e8d2, 0x74a6f16b, 0xc966606b, 0xa686ee9f)
+        );
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
new file mode 100644
index 00000000000..d573e2c0b83
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
@@ -0,0 +1,29 @@
+//! AArch64 intrinsics.
+//!
+//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
+//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
+//!
+//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
+//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
+
+mod v8;
+pub use self::v8::*;
+
+mod neon;
+pub use self::neon::*;
+
+mod crypto;
+pub use self::crypto::*;
+
+mod crc;
+pub use self::crc::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Generates the trap instruction `BRK 1`
+#[cfg_attr(test, assert_instr(brk))]
+#[inline]
+pub unsafe fn brk() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon.rs b/library/stdarch/crates/core_arch/src/aarch64/neon.rs
new file mode 100644
index 00000000000..0c43810dfa1
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon.rs
@@ -0,0 +1,2022 @@
+//! ARMv8 ASIMD intrinsics
+
+#![allow(non_camel_case_types)]
+
+// FIXME: replace neon with asimd
+
+use core_arch::arm::*;
+use core_arch::simd_llvm::*;
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+use mem;
+
+types! {
+    /// ARM-specific 64-bit wide vector of one packed `f64`.
+    pub struct float64x1_t(f64); // FIXME: check this!
+    /// ARM-specific 128-bit wide vector of two packed `f64`.
+    pub struct float64x2_t(f64, f64);
+    /// ARM-specific 64-bit wide vector of one packed `p64`.
+    pub struct poly64x1_t(i64); // FIXME: check this!
+    /// ARM-specific 128-bit wide vector of two packed `p64`.
+    pub struct poly64x2_t(i64, i64); // FIXME: check this!
+}
+
+/// ARM-specific type containing two `int8x16_t` vectors, exposed as the
+/// public tuple fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct int8x16x2_t(pub int8x16_t, pub int8x16_t);
+/// ARM-specific type containing three `int8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.2`.
+#[derive(Copy, Clone)]
+pub struct int8x16x3_t(pub int8x16_t, pub int8x16_t, pub int8x16_t);
+/// ARM-specific type containing four `int8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.3`.
+#[derive(Copy, Clone)]
+pub struct int8x16x4_t(pub int8x16_t, pub int8x16_t, pub int8x16_t, pub int8x16_t);
+
+/// ARM-specific type containing two `uint8x16_t` vectors, exposed as the
+/// public tuple fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct uint8x16x2_t(pub uint8x16_t, pub uint8x16_t);
+/// ARM-specific type containing three `uint8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.2`.
+#[derive(Copy, Clone)]
+pub struct uint8x16x3_t(pub uint8x16_t, pub uint8x16_t, pub uint8x16_t);
+/// ARM-specific type containing four `uint8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.3`.
+#[derive(Copy, Clone)]
+pub struct uint8x16x4_t(
+    pub uint8x16_t,
+    pub uint8x16_t,
+    pub uint8x16_t,
+    pub uint8x16_t,
+);
+
+/// ARM-specific type containing two `poly8x16_t` vectors, exposed as the
+/// public tuple fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct poly8x16x2_t(pub poly8x16_t, pub poly8x16_t);
+/// ARM-specific type containing three `poly8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.2`.
+#[derive(Copy, Clone)]
+pub struct poly8x16x3_t(pub poly8x16_t, pub poly8x16_t, pub poly8x16_t);
+/// ARM-specific type containing four `poly8x16_t` vectors, exposed as the
+/// public tuple fields `.0` through `.3`.
+#[derive(Copy, Clone)]
+pub struct poly8x16x4_t(
+    pub poly8x16_t,
+    pub poly8x16_t,
+    pub poly8x16_t,
+    pub poly8x16_t,
+);
+
+// Bindings to LLVM's AArch64 NEON intrinsics. Each declaration is resolved
+// through its `link_name`; the trailing components of that name spell out the
+// overloaded types (e.g. `.i8.v16i8` is an `i8` result from a 16 x `i8`
+// vector), so they must agree with the Rust signature underneath.
+// NOTE(review): `improper_ctypes` is presumably allowed because the
+// signatures pass SIMD vector types - confirm.
+#[allow(improper_ctypes)]
+extern "C" {
+    // Across-vector maximum (`smaxv` / `umaxv` / `fmaxv`).
+    #[link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"]
+    fn vmaxv_s8_(a: int8x8_t) -> i8;
+    #[link_name = "llvm.aarch64.neon.smaxv.i8.v16i8"]
+    fn vmaxvq_s8_(a: int8x16_t) -> i8;
+    #[link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"]
+    fn vmaxv_s16_(a: int16x4_t) -> i16;
+    #[link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"]
+    fn vmaxvq_s16_(a: int16x8_t) -> i16;
+    #[link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"]
+    fn vmaxv_s32_(a: int32x2_t) -> i32;
+    #[link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"]
+    fn vmaxvq_s32_(a: int32x4_t) -> i32;
+
+    #[link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"]
+    fn vmaxv_u8_(a: uint8x8_t) -> u8;
+    #[link_name = "llvm.aarch64.neon.umaxv.i8.v16i8"]
+    fn vmaxvq_u8_(a: uint8x16_t) -> u8;
+    #[link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"]
+    fn vmaxv_u16_(a: uint16x4_t) -> u16;
+    #[link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"]
+    fn vmaxvq_u16_(a: uint16x8_t) -> u16;
+    #[link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"]
+    fn vmaxv_u32_(a: uint32x2_t) -> u32;
+    #[link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"]
+    fn vmaxvq_u32_(a: uint32x4_t) -> u32;
+
+    #[link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32"]
+    fn vmaxv_f32_(a: float32x2_t) -> f32;
+    #[link_name = "llvm.aarch64.neon.fmaxv.f32.v4f32"]
+    fn vmaxvq_f32_(a: float32x4_t) -> f32;
+    #[link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64"]
+    fn vmaxvq_f64_(a: float64x2_t) -> f64;
+
+    // Across-vector minimum (`sminv` / `uminv` / `fminv`).
+    #[link_name = "llvm.aarch64.neon.sminv.i8.v8i8"]
+    fn vminv_s8_(a: int8x8_t) -> i8;
+    #[link_name = "llvm.aarch64.neon.sminv.i8.v16i8"]
+    fn vminvq_s8_(a: int8x16_t) -> i8;
+    #[link_name = "llvm.aarch64.neon.sminv.i16.v4i16"]
+    fn vminv_s16_(a: int16x4_t) -> i16;
+    #[link_name = "llvm.aarch64.neon.sminv.i16.v8i16"]
+    fn vminvq_s16_(a: int16x8_t) -> i16;
+    #[link_name = "llvm.aarch64.neon.sminv.i32.v2i32"]
+    fn vminv_s32_(a: int32x2_t) -> i32;
+    #[link_name = "llvm.aarch64.neon.sminv.i32.v4i32"]
+    fn vminvq_s32_(a: int32x4_t) -> i32;
+
+    #[link_name = "llvm.aarch64.neon.uminv.i8.v8i8"]
+    fn vminv_u8_(a: uint8x8_t) -> u8;
+    #[link_name = "llvm.aarch64.neon.uminv.i8.v16i8"]
+    fn vminvq_u8_(a: uint8x16_t) -> u8;
+    #[link_name = "llvm.aarch64.neon.uminv.i16.v4i16"]
+    fn vminv_u16_(a: uint16x4_t) -> u16;
+    #[link_name = "llvm.aarch64.neon.uminv.i16.v8i16"]
+    fn vminvq_u16_(a: uint16x8_t) -> u16;
+    #[link_name = "llvm.aarch64.neon.uminv.i32.v2i32"]
+    fn vminv_u32_(a: uint32x2_t) -> u32;
+    #[link_name = "llvm.aarch64.neon.uminv.i32.v4i32"]
+    fn vminvq_u32_(a: uint32x4_t) -> u32;
+
+    #[link_name = "llvm.aarch64.neon.fminv.f32.v2f32"]
+    fn vminv_f32_(a: float32x2_t) -> f32;
+    #[link_name = "llvm.aarch64.neon.fminv.f32.v4f32"]
+    fn vminvq_f32_(a: float32x4_t) -> f32;
+    #[link_name = "llvm.aarch64.neon.fminv.f64.v2f64"]
+    fn vminvq_f64_(a: float64x2_t) -> f64;
+
+    // Pairwise minimum (`sminp` / `uminp` / `fminp`).
+    #[link_name = "llvm.aarch64.neon.sminp.v16i8"]
+    fn vpminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    #[link_name = "llvm.aarch64.neon.sminp.v8i16"]
+    fn vpminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    #[link_name = "llvm.aarch64.neon.sminp.v4i32"]
+    fn vpminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    #[link_name = "llvm.aarch64.neon.uminp.v16i8"]
+    fn vpminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+    #[link_name = "llvm.aarch64.neon.uminp.v8i16"]
+    fn vpminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+    #[link_name = "llvm.aarch64.neon.uminp.v4i32"]
+    fn vpminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.neon.fminp.v4f32"]
+    fn vpminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    #[link_name = "llvm.aarch64.neon.fminp.v2f64"]
+    fn vpminq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+
+    // Pairwise maximum (`smaxp` / `umaxp` / `fmaxp`).
+    #[link_name = "llvm.aarch64.neon.smaxp.v16i8"]
+    fn vpmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    #[link_name = "llvm.aarch64.neon.smaxp.v8i16"]
+    fn vpmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    #[link_name = "llvm.aarch64.neon.smaxp.v4i32"]
+    fn vpmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    #[link_name = "llvm.aarch64.neon.umaxp.v16i8"]
+    fn vpmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+    #[link_name = "llvm.aarch64.neon.umaxp.v8i16"]
+    fn vpmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+    #[link_name = "llvm.aarch64.neon.umaxp.v4i32"]
+    fn vpmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+    #[link_name = "llvm.aarch64.neon.fmaxp.v4f32"]
+    fn vpmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    #[link_name = "llvm.aarch64.neon.fmaxp.v2f64"]
+    fn vpmaxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+
+    // Table look-ups (`tblN`) and extended table look-ups (`tbxN`), where N
+    // is the number of 16-byte table operands.
+    #[link_name = "llvm.aarch64.neon.tbl1.v8i8"]
+    fn vqtbl1(a: int8x16_t, b: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbl1.v16i8"]
+    fn vqtbl1q(a: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbx1.v8i8"]
+    fn vqtbx1(a: int8x8_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbx1.v16i8"]
+    fn vqtbx1q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbl2.v8i8"]
+    fn vqtbl2(a0: int8x16_t, a1: int8x16_t, b: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbl2.v16i8"]
+    fn vqtbl2q(a0: int8x16_t, a1: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbx2.v8i8"]
+    fn vqtbx2(a: int8x8_t, b0: int8x16_t, b1: int8x16_t, c: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbx2.v16i8"]
+    fn vqtbx2q(a: int8x16_t, b0: int8x16_t, b1: int8x16_t, c: uint8x16_t) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbl3.v8i8"]
+    fn vqtbl3(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, b: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbl3.v16i8"]
+    fn vqtbl3q(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbx3.v8i8"]
+    fn vqtbx3(a: int8x8_t, b0: int8x16_t, b1: int8x16_t, b2: int8x16_t, c: uint8x8_t) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbx3.v16i8"]
+    fn vqtbx3q(
+        a: int8x16_t,
+        b0: int8x16_t,
+        b1: int8x16_t,
+        b2: int8x16_t,
+        c: uint8x16_t,
+    ) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbl4.v8i8"]
+    fn vqtbl4(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, a3: int8x16_t, b: uint8x8_t)
+        -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbl4.v16i8"]
+    fn vqtbl4q(
+        a0: int8x16_t,
+        a1: int8x16_t,
+        a2: int8x16_t,
+        a3: int8x16_t,
+        b: uint8x16_t,
+    ) -> int8x16_t;
+
+    #[link_name = "llvm.aarch64.neon.tbx4.v8i8"]
+    fn vqtbx4(
+        a: int8x8_t,
+        b0: int8x16_t,
+        b1: int8x16_t,
+        b2: int8x16_t,
+        b3: int8x16_t,
+        c: uint8x8_t,
+    ) -> int8x8_t;
+    #[link_name = "llvm.aarch64.neon.tbx4.v16i8"]
+    fn vqtbx4q(
+        a: int8x16_t,
+        b0: int8x16_t,
+        b1: int8x16_t,
+        b2: int8x16_t,
+        b3: int8x16_t,
+        c: uint8x16_t,
+    ) -> int8x16_t;
+}
+
+/// Vector add: returns `simd_add(a, b)` for vectors holding a single `f64`.
+/// The codegen test expects an `fadd` instruction.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fadd))]
+pub unsafe fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for vectors holding two `f64` values.
+/// The codegen test expects an `fadd` instruction.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fadd))]
+pub unsafe fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: reinterprets the scalars `a` and `b` as `int64x1_t`, adds them
+/// with `simd_add`, and returns element 0 of the sum.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
+    let lhs: int64x1_t = mem::transmute(a);
+    let rhs: int64x1_t = mem::transmute(b);
+    let sum = simd_add(lhs, rhs);
+    simd_extract(sum, 0)
+}
+
+/// Vector add: reinterprets the scalars `a` and `b` as `uint64x1_t`, adds them
+/// with `simd_add`, and returns element 0 of the sum.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 {
+    let lhs: uint64x1_t = mem::transmute(a);
+    let rhs: uint64x1_t = mem::transmute(b);
+    let sum = simd_add(lhs, rhs);
+    simd_extract(sum, 0)
+}
+
+/// Horizontal vector max: folds `a` into a single `i8` by calling the
+/// `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+pub unsafe fn vmaxv_s8(a: int8x8_t) -> i8 {
+    vmaxv_s8_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `i8` by
+/// calling the `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+pub unsafe fn vmaxvq_s8(a: int8x16_t) -> i8 {
+    vmaxvq_s8_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `i16` by calling the
+/// `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+pub unsafe fn vmaxv_s16(a: int16x4_t) -> i16 {
+    vmaxv_s16_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `i16` by
+/// calling the `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+pub unsafe fn vmaxvq_s16(a: int16x8_t) -> i16 {
+    vmaxvq_s16_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `i32` by calling the
+/// `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+/// The codegen test expects this to lower to `smaxp` rather than `smaxv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+pub unsafe fn vmaxv_s32(a: int32x2_t) -> i32 {
+    vmaxv_s32_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `i32` by
+/// calling the `llvm.aarch64.neon.smaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+pub unsafe fn vmaxvq_s32(a: int32x4_t) -> i32 {
+    vmaxvq_s32_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `u8` by calling the
+/// `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+pub unsafe fn vmaxv_u8(a: uint8x8_t) -> u8 {
+    vmaxv_u8_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `u8` by
+/// calling the `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+pub unsafe fn vmaxvq_u8(a: uint8x16_t) -> u8 {
+    vmaxvq_u8_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `u16` by calling the
+/// `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+pub unsafe fn vmaxv_u16(a: uint16x4_t) -> u16 {
+    vmaxv_u16_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `u16` by
+/// calling the `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+pub unsafe fn vmaxvq_u16(a: uint16x8_t) -> u16 {
+    vmaxvq_u16_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `u32` by calling the
+/// `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+/// The codegen test expects this to lower to `umaxp` rather than `umaxv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+pub unsafe fn vmaxv_u32(a: uint32x2_t) -> u32 {
+    vmaxv_u32_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `u32` by
+/// calling the `llvm.aarch64.neon.umaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+pub unsafe fn vmaxvq_u32(a: uint32x4_t) -> u32 {
+    vmaxvq_u32_(a)
+}
+
+/// Horizontal vector max: folds `a` into a single `f32` by calling the
+/// `llvm.aarch64.neon.fmaxv` LLVM intrinsic.
+/// The codegen test expects this to lower to `fmaxp` rather than `fmaxv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+pub unsafe fn vmaxv_f32(a: float32x2_t) -> f32 {
+    vmaxv_f32_(a)
+}
+
+/// Horizontal vector max: folds the 128-bit vector `a` into a single `f32` by
+/// calling the `llvm.aarch64.neon.fmaxv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxv))]
+pub unsafe fn vmaxvq_f32(a: float32x4_t) -> f32 {
+    vmaxvq_f32_(a)
+}
+
+/// Horizontal vector max: folds the two `f64` values of `a` into one by
+/// calling the `llvm.aarch64.neon.fmaxv` LLVM intrinsic.
+/// The codegen test expects this to lower to `fmaxp` rather than `fmaxv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+pub unsafe fn vmaxvq_f64(a: float64x2_t) -> f64 {
+    vmaxvq_f64_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `i8` by calling the
+/// `llvm.aarch64.neon.sminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+pub unsafe fn vminv_s8(a: int8x8_t) -> i8 {
+    vminv_s8_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `i8` by
+/// calling the `llvm.aarch64.neon.sminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+pub unsafe fn vminvq_s8(a: int8x16_t) -> i8 {
+    vminvq_s8_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `i16` by calling the
+/// `llvm.aarch64.neon.sminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+pub unsafe fn vminv_s16(a: int16x4_t) -> i16 {
+    vminv_s16_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `i16` by
+/// calling the `llvm.aarch64.neon.sminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+pub unsafe fn vminvq_s16(a: int16x8_t) -> i16 {
+    vminvq_s16_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `i32` by calling the
+/// `llvm.aarch64.neon.sminv` LLVM intrinsic.
+/// The codegen test expects this to lower to `sminp` rather than `sminv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+pub unsafe fn vminv_s32(a: int32x2_t) -> i32 {
+    vminv_s32_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `i32` by
+/// calling the `llvm.aarch64.neon.sminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+pub unsafe fn vminvq_s32(a: int32x4_t) -> i32 {
+    vminvq_s32_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `u8` by calling the
+/// `llvm.aarch64.neon.uminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+pub unsafe fn vminv_u8(a: uint8x8_t) -> u8 {
+    vminv_u8_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `u8` by
+/// calling the `llvm.aarch64.neon.uminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+pub unsafe fn vminvq_u8(a: uint8x16_t) -> u8 {
+    vminvq_u8_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `u16` by calling the
+/// `llvm.aarch64.neon.uminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+pub unsafe fn vminv_u16(a: uint16x4_t) -> u16 {
+    vminv_u16_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `u16` by
+/// calling the `llvm.aarch64.neon.uminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+pub unsafe fn vminvq_u16(a: uint16x8_t) -> u16 {
+    vminvq_u16_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `u32` by calling the
+/// `llvm.aarch64.neon.uminv` LLVM intrinsic.
+/// The codegen test expects this to lower to `uminp` rather than `uminv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+pub unsafe fn vminv_u32(a: uint32x2_t) -> u32 {
+    vminv_u32_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `u32` by
+/// calling the `llvm.aarch64.neon.uminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+pub unsafe fn vminvq_u32(a: uint32x4_t) -> u32 {
+    vminvq_u32_(a)
+}
+
+/// Horizontal vector min: folds `a` into a single `f32` by calling the
+/// `llvm.aarch64.neon.fminv` LLVM intrinsic.
+/// The codegen test expects this to lower to `fminp` rather than `fminv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+pub unsafe fn vminv_f32(a: float32x2_t) -> f32 {
+    vminv_f32_(a)
+}
+
+/// Horizontal vector min: folds the 128-bit vector `a` into a single `f32` by
+/// calling the `llvm.aarch64.neon.fminv` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminv))]
+pub unsafe fn vminvq_f32(a: float32x4_t) -> f32 {
+    vminvq_f32_(a)
+}
+
+/// Horizontal vector min: folds the two `f64` values of `a` into one by
+/// calling the `llvm.aarch64.neon.fminv` LLVM intrinsic.
+/// The codegen test expects this to lower to `fminp` rather than `fminv`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+pub unsafe fn vminvq_f64(a: float64x2_t) -> f64 {
+    vminvq_f64_(a)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.sminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+pub unsafe fn vpminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    vpminq_s8_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.sminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+pub unsafe fn vpminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    vpminq_s16_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.sminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+pub unsafe fn vpminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    vpminq_s32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.uminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+pub unsafe fn vpminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    vpminq_u8_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.uminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+pub unsafe fn vpminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    vpminq_u16_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.uminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+pub unsafe fn vpminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    vpminq_u32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.fminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+pub unsafe fn vpminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    vpminq_f32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.fminp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+pub unsafe fn vpminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    vpminq_f64_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.smaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+pub unsafe fn vpmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    vpmaxq_s8_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.smaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+pub unsafe fn vpmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    vpmaxq_s16_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.smaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+pub unsafe fn vpmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    vpmaxq_s32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.umaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+pub unsafe fn vpmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    vpmaxq_u8_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.umaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+pub unsafe fn vpmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    vpmaxq_u16_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.umaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+pub unsafe fn vpmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    vpmaxq_u32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.fmaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+pub unsafe fn vpmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    vpmaxq_f32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs of `a` and `b`, forwarded to the
+/// `llvm.aarch64.neon.fmaxp` LLVM intrinsic.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    vpmaxq_f64_(a, b)
+}
+
+/// Vector combine: joins `low` and `high` into one `int8x16_t` by shuffling
+/// them with the identity index list `0..=15`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
+    simd_shuffle16(
+        low,
+        high,
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    )
+}
+
+/// Vector combine: joins `low` and `high` into one `int16x8_t` by shuffling
+/// them with the identity index list `0..=7`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
+    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine: joins `low` and `high` into one `int32x4_t` by shuffling
+/// them with the identity index list `0..=3`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
+    simd_shuffle4(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine: joins `low` and `high` into one `int64x2_t` by shuffling
+/// them with the identity index list `[0, 1]`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
+    simd_shuffle2(low, high, [0, 1])
+}
+
+/// Vector combine: joins `low` and `high` into one `uint8x16_t` by shuffling
+/// them with the identity index list `0..=15`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
+    simd_shuffle16(
+        low,
+        high,
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    )
+}
+
+/// Vector combine: joins `low` and `high` into one `uint16x8_t` by shuffling
+/// them with the identity index list `0..=7`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
+    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine: joins `low` and `high` into one `uint32x4_t` by shuffling
+/// them with the identity index list `0..=3`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
+    simd_shuffle4(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine: joins `low` and `high` into one `uint64x2_t` by shuffling
+/// them with the identity index list `[0, 1]`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
+    simd_shuffle2(low, high, [0, 1])
+}
+
+/// Vector combine: joins `low` and `high` into one `poly64x2_t` by shuffling
+/// them with the identity index list `[0, 1]`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t {
+    simd_shuffle2(low, high, [0, 1])
+}
+
+/* FIXME: 16-bit float
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_f16 ( low: float16x4_t,  high: float16x4_t) -> float16x8_t {
+    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+*/
+
+/// Vector combine: joins `low` and `high` into one `float32x4_t` by shuffling
+/// them with the identity index list `0..=3`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
+    simd_shuffle4(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine: joins `low` and `high` into one `poly8x16_t` by shuffling
+/// them with the identity index list `0..=15`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
+    simd_shuffle16(
+        low,
+        high,
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    )
+}
+
+/// Vector combine: joins `low` and `high` into one `poly16x8_t` by shuffling
+/// them with the identity index list `0..=7`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
+    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine: joins `low` and `high` into one `float64x2_t` by shuffling
+/// them with the identity index list `[0, 1]`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t {
+    simd_shuffle2(low, high, [0, 1])
+}
+
+/// Table look-up: widens the table `a` with a zeroed upper half and looks up
+/// the indices `b` (reinterpreted as unsigned bytes) through `vqtbl1_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    let table = vcombine_s8(a, ::mem::zeroed());
+    let indices: uint8x8_t = ::mem::transmute(b);
+    vqtbl1_s8(table, indices)
+}
+
+/// Table look-up: widens the table `a` with a zeroed upper half and looks up
+/// the indices `b` through `vqtbl1_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    let table = vcombine_u8(a, ::mem::zeroed());
+    vqtbl1_u8(table, b)
+}
+
+/// Table look-up: widens the table `a` with a zeroed upper half and looks up
+/// the indices `b` through `vqtbl1_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t {
+    let table = vcombine_p8(a, ::mem::zeroed());
+    vqtbl1_p8(table, b)
+}
+
+/// Table look-up: combines the two halves of `a` into one table and looks up
+/// the indices `b` (reinterpreted as unsigned bytes) through `vqtbl1_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t {
+    let table = vcombine_s8(a.0, a.1);
+    let indices: uint8x8_t = ::mem::transmute(b);
+    vqtbl1_s8(table, indices)
+}
+
+/// Table look-up: combines the two halves of `a` into one table and looks up
+/// the indices `b` through `vqtbl1_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t {
+    let table = vcombine_u8(a.0, a.1);
+    vqtbl1_u8(table, b)
+}
+
+/// Table look-up: combines the two halves of `a` into one table and looks up
+/// the indices `b` through `vqtbl1_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t {
+    let table = vcombine_p8(a.0, a.1);
+    vqtbl1_p8(table, b)
+}
+
+/// Table look-up: packs the three parts of `a` plus a zeroed fourth part into
+/// an `int8x16x2_t` table and looks up the indices `b` through `vqtbl2_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t {
+    let lower = vcombine_s8(a.0, a.1);
+    let upper = vcombine_s8(a.2, ::mem::zeroed());
+    vqtbl2_s8(int8x16x2_t(lower, upper), ::mem::transmute(b))
+}
+
+/// Table look-up: packs the three parts of `a` plus a zeroed fourth part into
+/// a `uint8x16x2_t` table and looks up the indices `b` through `vqtbl2_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t {
+    let lower = vcombine_u8(a.0, a.1);
+    let upper = vcombine_u8(a.2, ::mem::zeroed());
+    vqtbl2_u8(uint8x16x2_t(lower, upper), b)
+}
+
+/// Table look-up: packs the three parts of `a` plus a zeroed fourth part into
+/// a `poly8x16x2_t` table and looks up the indices `b` through `vqtbl2_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t {
+    let lower = vcombine_p8(a.0, a.1);
+    let upper = vcombine_p8(a.2, ::mem::zeroed());
+    vqtbl2_p8(poly8x16x2_t(lower, upper), b)
+}
+
+/// Table look-up: packs the four parts of `a` into an `int8x16x2_t` table and
+/// looks up the indices `b` through `vqtbl2_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t {
+    let lower = vcombine_s8(a.0, a.1);
+    let upper = vcombine_s8(a.2, a.3);
+    vqtbl2_s8(int8x16x2_t(lower, upper), ::mem::transmute(b))
+}
+
+/// Table look-up: packs the four parts of `a` into a `uint8x16x2_t` table and
+/// looks up the indices `b` through `vqtbl2_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t {
+    let lower = vcombine_u8(a.0, a.1);
+    let upper = vcombine_u8(a.2, a.3);
+    vqtbl2_u8(uint8x16x2_t(lower, upper), b)
+}
+
+/// Table look-up: packs the four parts of `a` into a `poly8x16x2_t` table and
+/// looks up the indices `b` through `vqtbl2_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t {
+    let lower = vcombine_p8(a.0, a.1);
+    let upper = vcombine_p8(a.2, a.3);
+    vqtbl2_p8(poly8x16x2_t(lower, upper), b)
+}
+
+/// Extended table look-up: runs `vqtbx1_s8` on `b` widened with a zeroed
+/// upper half, then uses `simd_select` with the mask `c < 8` to choose
+/// between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+    use core_arch::simd::i8x8;
+    let padded = vcombine_s8(b, ::mem::zeroed());
+    let looked_up = vqtbx1_s8(a, padded, ::mem::transmute(c));
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(i8x8::splat(8)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: runs `vqtbx1_u8` on `b` widened with a zeroed
+/// upper half, then uses `simd_select` with the mask `c < 8` to choose
+/// between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+    use core_arch::simd::u8x8;
+    let padded = vcombine_u8(b, ::mem::zeroed());
+    let looked_up = vqtbx1_u8(a, padded, c);
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(u8x8::splat(8)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: runs `vqtbx1_p8` on `b` widened with a zeroed
+/// upper half, then uses `simd_select` with the mask `c < 8` to choose
+/// between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t {
+    use core_arch::simd::u8x8;
+    let padded = vcombine_p8(b, ::mem::zeroed());
+    let looked_up = vqtbx1_p8(a, padded, c);
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(u8x8::splat(8)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: combines the two halves of `b` into one table and
+/// forwards `a`, that table and the indices `c` to `vqtbx1_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t {
+    let table = vcombine_s8(b.0, b.1);
+    let indices: uint8x8_t = ::mem::transmute(c);
+    vqtbx1_s8(a, table, indices)
+}
+
+/// Extended table look-up: combines the two halves of `b` into one table and
+/// forwards `a`, that table and the indices `c` to `vqtbx1_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t {
+    let table = vcombine_u8(b.0, b.1);
+    vqtbx1_u8(a, table, c)
+}
+
+/// Extended table look-up: combines the two halves of `b` into one table and
+/// forwards `a`, that table and the indices `c` to `vqtbx1_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t {
+    let table = vcombine_p8(b.0, b.1);
+    vqtbx1_p8(a, table, c)
+}
+
+/// Extended table look-up: runs `vqtbx2_s8` on the three parts of `b` plus a
+/// zeroed fourth part, then uses `simd_select` with the mask `c < 24` to
+/// choose between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t {
+    use core_arch::simd::i8x8;
+    let lower = vcombine_s8(b.0, b.1);
+    let upper = vcombine_s8(b.2, ::mem::zeroed());
+    let looked_up = vqtbx2_s8(a, int8x16x2_t(lower, upper), ::mem::transmute(c));
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(i8x8::splat(24)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: runs `vqtbx2_u8` on the three parts of `b` plus a
+/// zeroed fourth part, then uses `simd_select` with the mask `c < 24` to
+/// choose between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t {
+    use core_arch::simd::u8x8;
+    let lower = vcombine_u8(b.0, b.1);
+    let upper = vcombine_u8(b.2, ::mem::zeroed());
+    let looked_up = vqtbx2_u8(a, uint8x16x2_t(lower, upper), c);
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(u8x8::splat(24)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: runs `vqtbx2_p8` on the three parts of `b` plus a
+/// zeroed fourth part, then uses `simd_select` with the mask `c < 24` to
+/// choose between that result and `a`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t {
+    use core_arch::simd::u8x8;
+    let lower = vcombine_p8(b.0, b.1);
+    let upper = vcombine_p8(b.2, ::mem::zeroed());
+    let looked_up = vqtbx2_p8(a, poly8x16x2_t(lower, upper), c);
+    let in_range: int8x8_t = simd_lt(c, ::mem::transmute(u8x8::splat(24)));
+    simd_select(in_range, looked_up, a)
+}
+
+/// Extended table look-up: packs the four parts of `b` into an `int8x16x2_t`
+/// table and forwards `a`, that table and the indices `c` to `vqtbx2_s8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t {
+    let lower = vcombine_s8(b.0, b.1);
+    let upper = vcombine_s8(b.2, b.3);
+    vqtbx2_s8(a, int8x16x2_t(lower, upper), ::mem::transmute(c))
+}
+
+/// Extended table look-up: packs the four parts of `b` into a `uint8x16x2_t`
+/// table and forwards `a`, that table and the indices `c` to `vqtbx2_u8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t {
+    let lower = vcombine_u8(b.0, b.1);
+    let upper = vcombine_u8(b.2, b.3);
+    vqtbx2_u8(a, uint8x16x2_t(lower, upper), c)
+}
+
+/// Extended table look-up: packs the four parts of `b` into a `poly8x16x2_t`
+/// table and forwards `a`, that table and the indices `c` to `vqtbx2_p8`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t {
+    let lower = vcombine_p8(b.0, b.1);
+    let upper = vcombine_p8(b.2, b.3);
+    vqtbx2_p8(a, poly8x16x2_t(lower, upper), c)
+}
+
+/// Table look-up: forwards the table `t` and the indices `idx` to the
+/// `llvm.aarch64.neon.tbl1` binding `vqtbl1`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1_s8(t: int8x16_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbl1(t, idx)
+}
+/// Table look-up: forwards the table `t` and the indices `idx` to the
+/// `llvm.aarch64.neon.tbl1` binding `vqtbl1q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbl1q(t, idx)
+}
+/// Table look-up: reinterprets `t` as `int8x16_t`, calls the `vqtbl1` binding
+/// with it and `idx`, and reinterprets the result as `uint8x8_t`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1_u8(t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t {
+    let table: int8x16_t = ::mem::transmute(t);
+    let looked_up: int8x8_t = vqtbl1(table, ::mem::transmute(idx));
+    ::mem::transmute(looked_up)
+}
+/// Table look-up for a `uint8x16_t` table, full-width result.
+///
+/// Bit-casts `t` to the vector type `vqtbl1q` operates on and casts the
+/// result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t {
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbl1q(table, idx))
+}
+/// Table look-up for a `poly8x16_t` table.
+///
+/// Bit-casts `t` to the vector type `vqtbl1` operates on and casts the result
+/// back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1_p8(t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t {
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbl1(table, idx))
+}
+/// Table look-up for a `poly8x16_t` table, full-width result.
+///
+/// Bit-casts `t` to the vector type `vqtbl1q` operates on and casts the
+/// result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl1q_p8(t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t {
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbl1q(table, idx))
+}
+/// Extended table look-up (`tbx`): forwards `a`, the table `t` and `idx` to `vqtbx1`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1_s8(a: int8x8_t, t: int8x16_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbx1(a, t, idx)
+}
+/// Extended table look-up (`tbx`): forwards `a`, the table `t` and `idx` to `vqtbx1q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1q_s8(a: int8x16_t, t: int8x16_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbx1q(a, t, idx)
+}
+/// Extended table look-up for a `uint8x16_t` table.
+///
+/// Bit-casts `a` and `t` to the vector types `vqtbx1` operates on and casts
+/// the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1_u8(a: uint8x8_t, t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbx1(a_cast, table, idx))
+}
+/// Extended table look-up for a `uint8x16_t` table, full-width result.
+///
+/// Bit-casts `a` and `t` to the vector types `vqtbx1q` operates on and casts
+/// the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1q_u8(a: uint8x16_t, t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbx1q(a_cast, table, idx))
+}
+/// Extended table look-up for a `poly8x16_t` table.
+///
+/// Bit-casts `a` and `t` to the vector types `vqtbx1` operates on and casts
+/// the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1_p8(a: poly8x8_t, t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbx1(a_cast, table, idx))
+}
+/// Extended table look-up for a `poly8x16_t` table, full-width result.
+///
+/// Bit-casts `a` and `t` to the vector types `vqtbx1q` operates on and casts
+/// the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx1q_p8(a: poly8x16_t, t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let table = ::mem::transmute(t);
+    ::mem::transmute(vqtbx1q(a_cast, table, idx))
+}
+
+/// Table look-up (`tbl`): passes both parts of the table `t` and `idx` to `vqtbl2`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2_s8(t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbl2(t.0, t.1, idx)
+}
+/// Table look-up (`tbl`): passes both parts of the table `t` and `idx` to `vqtbl2q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2q_s8(t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbl2q(t.0, t.1, idx)
+}
+/// Table look-up for a two-part `uint8x16x2_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl2` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2_u8(t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbl2(t0, t1, idx))
+}
+/// Table look-up for a two-part `uint8x16x2_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl2q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2q_u8(t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbl2q(t0, t1, idx))
+}
+/// Table look-up for a two-part `poly8x16x2_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl2` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2_p8(t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbl2(t0, t1, idx))
+}
+/// Table look-up for a two-part `poly8x16x2_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl2q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl2q_p8(t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbl2q(t0, t1, idx))
+}
+/// Extended table look-up (`tbx`): passes `a`, both parts of `t` and `idx` to `vqtbx2`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2_s8(a: int8x8_t, t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbx2(a, t.0, t.1, idx)
+}
+/// Extended table look-up (`tbx`): passes `a`, both parts of `t` and `idx` to `vqtbx2q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2q_s8(a: int8x16_t, t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbx2q(a, t.0, t.1, idx)
+}
+/// Extended table look-up for a two-part `uint8x16x2_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx2` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2_u8(a: uint8x8_t, t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbx2(a_cast, t0, t1, idx))
+}
+/// Extended table look-up for a two-part `uint8x16x2_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx2q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2q_u8(a: uint8x16_t, t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbx2q(a_cast, t0, t1, idx))
+}
+/// Extended table look-up for a two-part `poly8x16x2_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx2` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2_p8(a: poly8x8_t, t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbx2(a_cast, t0, t1, idx))
+}
+/// Extended table look-up for a two-part `poly8x16x2_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx2q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx2q_p8(a: poly8x16_t, t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    ::mem::transmute(vqtbx2q(a_cast, t0, t1, idx))
+}
+
+/// Table look-up (`tbl`): passes the three parts of the table `t` and `idx` to `vqtbl3`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3_s8(t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbl3(t.0, t.1, t.2, idx)
+}
+/// Table look-up (`tbl`): passes the three parts of the table `t` and `idx` to `vqtbl3q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3q_s8(t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbl3q(t.0, t.1, t.2, idx)
+}
+/// Table look-up for a three-part `uint8x16x3_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl3` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3_u8(t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbl3(t0, t1, t2, idx))
+}
+/// Table look-up for a three-part `uint8x16x3_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl3q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3q_u8(t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbl3q(t0, t1, t2, idx))
+}
+/// Table look-up for a three-part `poly8x16x3_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl3` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3_p8(t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbl3(t0, t1, t2, idx))
+}
+/// Table look-up for a three-part `poly8x16x3_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl3q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl3q_p8(t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbl3q(t0, t1, t2, idx))
+}
+/// Extended table look-up (`tbx`): passes `a`, the three parts of `t` and `idx` to `vqtbx3`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3_s8(a: int8x8_t, t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbx3(a, t.0, t.1, t.2, idx)
+}
+/// Extended table look-up (`tbx`): passes `a`, the three parts of `t` and `idx` to `vqtbx3q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3q_s8(a: int8x16_t, t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbx3q(a, t.0, t.1, t.2, idx)
+}
+/// Extended table look-up for a three-part `uint8x16x3_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx3` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3_u8(a: uint8x8_t, t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbx3(a_cast, t0, t1, t2, idx))
+}
+/// Extended table look-up for a three-part `uint8x16x3_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx3q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3q_u8(a: uint8x16_t, t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbx3q(a_cast, t0, t1, t2, idx))
+}
+/// Extended table look-up for a three-part `poly8x16x3_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx3` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3_p8(a: poly8x8_t, t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbx3(a_cast, t0, t1, t2, idx))
+}
+/// Extended table look-up for a three-part `poly8x16x3_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx3q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx3q_p8(a: poly8x16_t, t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    ::mem::transmute(vqtbx3q(a_cast, t0, t1, t2, idx))
+}
+
+/// Table look-up (`tbl`): passes the four parts of the table `t` and `idx` to `vqtbl4`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4_s8(t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbl4(t.0, t.1, t.2, t.3, idx)
+}
+/// Table look-up (`tbl`): passes the four parts of the table `t` and `idx` to `vqtbl4q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4q_s8(t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbl4q(t.0, t.1, t.2, t.3, idx)
+}
+/// Table look-up for a four-part `uint8x16x4_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl4` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4_u8(t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbl4(t0, t1, t2, t3, idx))
+}
+/// Table look-up for a four-part `uint8x16x4_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl4q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4q_u8(t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbl4q(t0, t1, t2, t3, idx))
+}
+/// Table look-up for a four-part `poly8x16x4_t` table.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl4` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4_p8(t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbl4(t0, t1, t2, t3, idx))
+}
+/// Table look-up for a four-part `poly8x16x4_t` table, full-width result.
+///
+/// Bit-casts each part of `t` to the vector type `vqtbl4q` operates on and
+/// casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+pub unsafe fn vqtbl4q_p8(t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t {
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbl4q(t0, t1, t2, t3, idx))
+}
+/// Extended table look-up (`tbx`): passes `a`, the four parts of `t` and `idx` to `vqtbx4`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4_s8(a: int8x8_t, t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t {
+    vqtbx4(a, t.0, t.1, t.2, t.3, idx)
+}
+/// Extended table look-up (`tbx`): passes `a`, the four parts of `t` and `idx` to `vqtbx4q`.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4q_s8(a: int8x16_t, t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t {
+    vqtbx4q(a, t.0, t.1, t.2, t.3, idx)
+}
+/// Extended table look-up for a four-part `uint8x16x4_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx4` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4_u8(a: uint8x8_t, t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbx4(a_cast, t0, t1, t2, t3, idx))
+}
+/// Extended table look-up for a four-part `uint8x16x4_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx4q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4q_u8(a: uint8x16_t, t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbx4q(a_cast, t0, t1, t2, t3, idx))
+}
+/// Extended table look-up for a four-part `poly8x16x4_t` table.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx4` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4_p8(a: poly8x8_t, t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbx4(a_cast, t0, t1, t2, t3, idx))
+}
+/// Extended table look-up for a four-part `poly8x16x4_t` table, full-width
+/// result.
+///
+/// Bit-casts `a` and each part of `t` to the vector types `vqtbx4q` operates
+/// on and casts the result back; `idx` is handed over unchanged.
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+pub unsafe fn vqtbx4q_p8(a: poly8x16_t, t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t {
+    let a_cast = ::mem::transmute(a);
+    let t0 = ::mem::transmute(t.0);
+    let t1 = ::mem::transmute(t.1);
+    let t2 = ::mem::transmute(t.2);
+    let t3 = ::mem::transmute(t.3);
+    ::mem::transmute(vqtbx4q(a_cast, t0, t1, t2, t3, idx))
+}
+
+// Unit tests for the AArch64-only NEON intrinsics. Inputs and results are
+// moved between the portable `core_arch::simd` vector types and the NEON
+// vector types with `transmute`.
+#[cfg(test)]
+mod tests {
+    use core_arch::aarch64::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    // --- Addition ---
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_f64() {
+        let a = 1.;
+        let b = 8.;
+        let e = 9.;
+        let r: f64 = mem::transmute(vadd_f64(mem::transmute(a), mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_f64() {
+        let a = f64x2::new(1., 2.);
+        let b = f64x2::new(8., 7.);
+        let e = f64x2::new(9., 9.);
+        let r: f64x2 = ::mem::transmute(vaddq_f64(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddd_s64() {
+        let a = 1_i64;
+        let b = 8_i64;
+        let e = 9_i64;
+        let r: i64 = mem::transmute(vaddd_s64(mem::transmute(a), mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddd_u64() {
+        let a = 1_u64;
+        let b = 8_u64;
+        let e = 9_u64;
+        let r: u64 = mem::transmute(vaddd_u64(mem::transmute(a), mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // --- Maximum across all lanes of one vector ---
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_s8() {
+        let r = vmaxv_s8(::mem::transmute(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)));
+        assert_eq!(r, 7_i8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_s8() {
+        #[rustfmt::skip]
+        let r = vmaxvq_s8(::mem::transmute(i8x16::new(
+            1, 2, 3, 4,
+            -16, 6, 7, 5,
+            8, 1, 1, 1,
+            1, 1, 1, 1,
+        )));
+        assert_eq!(r, 8_i8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_s16() {
+        let r = vmaxv_s16(::mem::transmute(i16x4::new(1, 2, -4, 3)));
+        assert_eq!(r, 3_i16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_s16() {
+        let r = vmaxvq_s16(::mem::transmute(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)));
+        assert_eq!(r, 7_i16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_s32() {
+        let r = vmaxv_s32(::mem::transmute(i32x2::new(1, -4)));
+        assert_eq!(r, 1_i32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_s32() {
+        let r = vmaxvq_s32(::mem::transmute(i32x4::new(1, 2, -32, 4)));
+        assert_eq!(r, 4_i32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_u8() {
+        let r = vmaxv_u8(::mem::transmute(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)));
+        assert_eq!(r, 8_u8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_u8() {
+        #[rustfmt::skip]
+        let r = vmaxvq_u8(::mem::transmute(u8x16::new(
+            1, 2, 3, 4,
+            16, 6, 7, 5,
+            8, 1, 1, 1,
+            1, 1, 1, 1,
+        )));
+        assert_eq!(r, 16_u8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_u16() {
+        let r = vmaxv_u16(::mem::transmute(u16x4::new(1, 2, 4, 3)));
+        assert_eq!(r, 4_u16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_u16() {
+        let r = vmaxvq_u16(::mem::transmute(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)));
+        assert_eq!(r, 16_u16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_u32() {
+        let r = vmaxv_u32(::mem::transmute(u32x2::new(1, 4)));
+        assert_eq!(r, 4_u32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_u32() {
+        let r = vmaxvq_u32(::mem::transmute(u32x4::new(1, 2, 32, 4)));
+        assert_eq!(r, 32_u32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxv_f32() {
+        let r = vmaxv_f32(::mem::transmute(f32x2::new(1., 4.)));
+        assert_eq!(r, 4_f32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_f32() {
+        let r = vmaxvq_f32(::mem::transmute(f32x4::new(1., 2., 32., 4.)));
+        assert_eq!(r, 32_f32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmaxvq_f64() {
+        let r = vmaxvq_f64(::mem::transmute(f64x2::new(1., 4.)));
+        assert_eq!(r, 4_f64);
+    }
+
+    // --- Minimum across all lanes of one vector ---
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_s8() {
+        let r = vminv_s8(::mem::transmute(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)));
+        assert_eq!(r, -8_i8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_s8() {
+        #[rustfmt::skip]
+        let r = vminvq_s8(::mem::transmute(i8x16::new(
+            1, 2, 3, 4,
+            -16, 6, 7, 5,
+            8, 1, 1, 1,
+            1, 1, 1, 1,
+        )));
+        assert_eq!(r, -16_i8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_s16() {
+        let r = vminv_s16(::mem::transmute(i16x4::new(1, 2, -4, 3)));
+        assert_eq!(r, -4_i16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_s16() {
+        let r = vminvq_s16(::mem::transmute(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)));
+        assert_eq!(r, -16_i16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_s32() {
+        let r = vminv_s32(::mem::transmute(i32x2::new(1, -4)));
+        assert_eq!(r, -4_i32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_s32() {
+        let r = vminvq_s32(::mem::transmute(i32x4::new(1, 2, -32, 4)));
+        assert_eq!(r, -32_i32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_u8() {
+        let r = vminv_u8(::mem::transmute(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)));
+        assert_eq!(r, 1_u8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_u8() {
+        #[rustfmt::skip]
+        let r = vminvq_u8(::mem::transmute(u8x16::new(
+            1, 2, 3, 4,
+            16, 6, 7, 5,
+            8, 1, 1, 1,
+            1, 1, 1, 1,
+        )));
+        assert_eq!(r, 1_u8);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_u16() {
+        let r = vminv_u16(::mem::transmute(u16x4::new(1, 2, 4, 3)));
+        assert_eq!(r, 1_u16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_u16() {
+        let r = vminvq_u16(::mem::transmute(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)));
+        assert_eq!(r, 1_u16);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_u32() {
+        let r = vminv_u32(::mem::transmute(u32x2::new(1, 4)));
+        assert_eq!(r, 1_u32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_u32() {
+        let r = vminvq_u32(::mem::transmute(u32x4::new(1, 2, 32, 4)));
+        assert_eq!(r, 1_u32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminv_f32() {
+        let r = vminv_f32(::mem::transmute(f32x2::new(1., 4.)));
+        assert_eq!(r, 1_f32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_f32() {
+        let r = vminvq_f32(::mem::transmute(f32x4::new(1., 2., 32., 4.)));
+        assert_eq!(r, 1_f32);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vminvq_f64() {
+        let r = vminvq_f64(::mem::transmute(f64x2::new(1., 4.)));
+        assert_eq!(r, 1_f64);
+    }
+
+    // --- Pairwise minimum ---
+    // The expected vectors hold the minimum of each adjacent pair of `a`,
+    // followed by the minimum of each adjacent pair of `b`.
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_s8() {
+        #[rustfmt::skip]
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        #[rustfmt::skip]
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+        #[rustfmt::skip]
+        let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
+        let r: i8x16 = ::mem::transmute(vpminq_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_s16() {
+        let a = i16x8::new(1, -2, 3, 4, 5, 6, 7, 8);
+        let b = i16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = i16x8::new(-2, 3, 5, 7, 0, 2, 4, 6);
+        let r: i16x8 = ::mem::transmute(vpminq_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_s32() {
+        let a = i32x4::new(1, -2, 3, 4);
+        let b = i32x4::new(0, 3, 2, 5);
+        let e = i32x4::new(-2, 3, 0, 2);
+        let r: i32x4 = ::mem::transmute(vpminq_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_u8() {
+        #[rustfmt::skip]
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        #[rustfmt::skip]
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+        #[rustfmt::skip]
+        let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
+        let r: u8x16 = ::mem::transmute(vpminq_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_u16() {
+        let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = u16x8::new(1, 3, 5, 7, 0, 2, 4, 6);
+        let r: u16x8 = ::mem::transmute(vpminq_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_u32() {
+        let a = u32x4::new(1, 2, 3, 4);
+        let b = u32x4::new(0, 3, 2, 5);
+        let e = u32x4::new(1, 3, 0, 2);
+        let r: u32x4 = ::mem::transmute(vpminq_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // Named after the intrinsic it exercises (`vpminq_f32`).
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_f32() {
+        let a = f32x4::new(1., -2., 3., 4.);
+        let b = f32x4::new(0., 3., 2., 5.);
+        let e = f32x4::new(-2., 3., 0., 2.);
+        let r: f32x4 = ::mem::transmute(vpminq_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // Named after the intrinsic it exercises (`vpminq_f64`).
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpminq_f64() {
+        let a = f64x2::new(1., -2.);
+        let b = f64x2::new(0., 3.);
+        let e = f64x2::new(-2., 0.);
+        let r: f64x2 = ::mem::transmute(vpminq_f64(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // --- Pairwise maximum ---
+    // Same layout as above, with the maximum of each adjacent pair.
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_s8() {
+        #[rustfmt::skip]
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        #[rustfmt::skip]
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+        #[rustfmt::skip]
+        let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
+        let r: i8x16 = ::mem::transmute(vpmaxq_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_s16() {
+        let a = i16x8::new(1, -2, 3, 4, 5, 6, 7, 8);
+        let b = i16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = i16x8::new(1, 4, 6, 8, 3, 5, 7, 9);
+        let r: i16x8 = ::mem::transmute(vpmaxq_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_s32() {
+        let a = i32x4::new(1, -2, 3, 4);
+        let b = i32x4::new(0, 3, 2, 5);
+        let e = i32x4::new(1, 4, 3, 5);
+        let r: i32x4 = ::mem::transmute(vpmaxq_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_u8() {
+        #[rustfmt::skip]
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        #[rustfmt::skip]
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+        #[rustfmt::skip]
+        let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
+        let r: u8x16 = ::mem::transmute(vpmaxq_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_u16() {
+        let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = u16x8::new(2, 4, 6, 8, 3, 5, 7, 9);
+        let r: u16x8 = ::mem::transmute(vpmaxq_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_u32() {
+        let a = u32x4::new(1, 2, 3, 4);
+        let b = u32x4::new(0, 3, 2, 5);
+        let e = u32x4::new(2, 4, 3, 5);
+        let r: u32x4 = ::mem::transmute(vpmaxq_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // Named after the intrinsic it exercises (`vpmaxq_f32`).
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_f32() {
+        let a = f32x4::new(1., -2., 3., 4.);
+        let b = f32x4::new(0., 3., 2., 5.);
+        let e = f32x4::new(1., 4., 3., 5.);
+        let r: f32x4 = ::mem::transmute(vpmaxq_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // Named after the intrinsic it exercises (`vpmaxq_f64`).
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmaxq_f64() {
+        let a = f64x2::new(1., -2.);
+        let b = f64x2::new(0., 3.);
+        let e = f64x2::new(1., 3.);
+        let r: f64x2 = ::mem::transmute(vpmaxq_f64(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    // --- vcombine ---
+
+    // Generates a test named `$test_id` that calls `$fn_id` on the two array
+    // literals and checks that the result is the first array followed by the
+    // second.
+    macro_rules! test_vcombine {
+        ($test_id:ident => $fn_id:ident ([$($a:expr),*], [$($b:expr),*])) => {
+            #[allow(unused_assignments)]
+            #[simd_test(enable = "neon")]
+            unsafe fn $test_id() {
+                let a = [$($a),*];
+                let b = [$($b),*];
+                let e = [$($a),* $(, $b)*];
+                let c = $fn_id(::mem::transmute(a), ::mem::transmute(b));
+                // `d` starts out as a copy of `e` only so that it gets the
+                // same array type, which in turn fixes the `transmute`
+                // target; the initial value is overwritten right away.
+                let mut d = e;
+                d = ::mem::transmute(c);
+                assert_eq!(d, e);
+            }
+        }
+    }
+
+    test_vcombine!(test_vcombine_s8 => vcombine_s8([3_i8, -4, 5, -6, 7, 8, 9, 10], [13_i8, -14, 15, -16, 17, 18, 19, 110]));
+    test_vcombine!(test_vcombine_u8 => vcombine_u8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110]));
+    test_vcombine!(test_vcombine_p8 => vcombine_p8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110]));
+
+    test_vcombine!(test_vcombine_s16 => vcombine_s16([3_i16, -4, 5, -6], [13_i16, -14, 15, -16]));
+    test_vcombine!(test_vcombine_u16 => vcombine_u16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16]));
+    test_vcombine!(test_vcombine_p16 => vcombine_p16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16]));
+    // FIXME: 16-bit floats
+    // test_vcombine!(test_vcombine_f16 => vcombine_f16([3_f16, 4., 5., 6.],
+    // [13_f16, 14., 15., 16.]));
+
+    test_vcombine!(test_vcombine_s32 => vcombine_s32([3_i32, -4], [13_i32, -14]));
+    test_vcombine!(test_vcombine_u32 => vcombine_u32([3_u32, 4], [13_u32, 14]));
+    // note: poly32x4 does not exist, and neither does vcombine_p32
+    test_vcombine!(test_vcombine_f32 => vcombine_f32([3_f32, -4.], [13_f32, -14.]));
+
+    test_vcombine!(test_vcombine_s64 => vcombine_s64([-3_i64], [13_i64]));
+    test_vcombine!(test_vcombine_u64 => vcombine_u64([3_u64], [13_u64]));
+    test_vcombine!(test_vcombine_p64 => vcombine_p64([3_u64], [13_u64]));
+    test_vcombine!(test_vcombine_f64 => vcombine_f64([-3_f64], [13_f64]));
+}
+
+#[cfg(test)]
+#[cfg(target_endian = "little")]
+#[path = "../arm/table_lookup_tests.rs"]
+mod table_lookup_tests; // test-only, little-endian only; the source file is the one named in `#[path]`
diff --git a/library/stdarch/crates/core_arch/src/aarch64/v8.rs b/library/stdarch/crates/core_arch/src/aarch64/v8.rs
new file mode 100644
index 00000000000..fdc9e78aac4
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/v8.rs
@@ -0,0 +1,105 @@
+//! ARMv8 intrinsics.
+//!
+//! The reference is [ARMv8-A Reference Manual][armv8].
+//!
+//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.
+//! ddi0487a.k_10775/index.html
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Reverses the order of the bytes of `x`.
+///
+/// The least significant byte becomes the most significant one and so on;
+/// the bits inside each byte keep their order.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u64(x: u64) -> u64 {
+    // `swap_bytes` already yields a `u64`, so the cast does not change the value.
+    x.swap_bytes() as u64
+}
+
+/// Count Leading Zeros.
+///
+/// Returns the number of zero bits above the most significant set bit of
+/// `x`; the result is `64` when `x` is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(clz))]
+pub unsafe fn _clz_u64(x: u64) -> u64 {
+    // `leading_zeros` returns a `u32`; widen it to the operand width.
+    x.leading_zeros() as u64
+}
+
+/// Reverse the bit order.
+///
+/// Bit 0 of `x` becomes bit 63 of the result, bit 1 becomes bit 62, and so
+/// on.
+#[inline]
+#[cfg_attr(test, assert_instr(rbit))]
+pub unsafe fn _rbit_u64(x: u64) -> u64 {
+    // Delegates to the compiler's bit-reversal intrinsic.
+    use intrinsics::bitreverse;
+    bitreverse(x)
+}
+
+/// Counts the leading sign bits.
+///
+/// Returns the number of consecutive bits directly below the most significant
+/// bit that are equal to it. The most significant bit itself is not counted,
+/// so the result is at most `31` (reached when all bits are set or all bits
+/// are clear).
+#[inline]
+#[cfg_attr(test, assert_instr(cls))]
+pub unsafe fn _cls_u32(x: u32) -> u32 {
+    // XOR with the sign-extended top bit clears every bit that matches the
+    // sign bit; `<< 1` discards the sign bit itself and `| 1` caps the count
+    // of leading zeros at 31.
+    u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1) as u32
+}
+
+/// Counts the leading sign bits.
+///
+/// Returns the number of consecutive bits directly below the most significant
+/// bit that are equal to it. The most significant bit itself is not counted,
+/// so the result is at most `63` (reached when all bits are set or all bits
+/// are clear).
+#[inline]
+#[cfg_attr(test, assert_instr(cls))]
+pub unsafe fn _cls_u64(x: u64) -> u64 {
+    // XOR with the sign-extended top bit clears every bit that matches the
+    // sign bit; `<< 1` discards the sign bit itself and `| 1` caps the count
+    // of leading zeros at 63.
+    u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64
+}
+
+#[cfg(test)]
+mod tests {
+    // Each test shares its name with the intrinsic it checks, hence the
+    // `v8::` qualification on every call.
+    use core_arch::aarch64::v8;
+
+    #[test]
+    fn _rev_u64() {
+        unsafe {
+            // Only the low four bytes of the input are written out; after the
+            // byte swap they occupy the high four bytes in reverse order.
+            assert_eq!(
+                v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64),
+                0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
+            );
+        }
+    }
+
+    #[test]
+    fn _clz_u64() {
+        unsafe {
+            // The highest set bit is bit 3, leaving 60 zero bits above it.
+            assert_eq!(v8::_clz_u64(0b0000_1010u64), 60u64);
+        }
+    }
+
+    #[test]
+    fn _rbit_u64() {
+        unsafe {
+            // The asymmetric byte `1111_1101` must come out as `1011_1111`,
+            // which distinguishes a bit reversal from a byte swap.
+            assert_eq!(
+                v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64),
+                0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
+            );
+        }
+    }
+
+    #[test]
+    fn _cls_u32() {
+        unsafe {
+            // 16 leading one bits: the top bit plus 15 copies of it.
+            assert_eq!(
+                v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32),
+                15_u32
+            );
+        }
+    }
+
+    #[test]
+    fn _cls_u64() {
+        unsafe {
+            // 16 leading one bits: the top bit plus 15 copies of it.
+            assert_eq!(
+                v8::_cls_u64(
+                    0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
+                ),
+                15_u64
+            );
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/armclang.rs b/library/stdarch/crates/core_arch/src/arm/armclang.rs
new file mode 100644
index 00000000000..36a3a2fe9a3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/armclang.rs
@@ -0,0 +1,68 @@
+//! ARM compiler specific intrinsics
+//!
+//! # References
+//!
+//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref]
+//!
+//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Inserts a breakpoint instruction.
+///
+/// `val` is a compile-time constant integer in range `[0, 255]`.
+///
+/// The breakpoint instruction inserted by this implementation is:
+///
+/// * `BKPT` when compiling for `target_arch = "arm"`,
+/// * `BRK` when compiling for `target_arch = "aarch64"`.
+///
+/// # Safety
+///
+/// If `val` is out-of-range the behavior is **undefined**.
+///
+/// # Note
+///
+/// [ARM's documentation][arm_docs] defines that `__breakpoint` accepts the
+/// following values for `val`:
+///
+/// - `0...65535` when compiling as A32 or A64,
+/// - `0...255` when compiling as T32.
+///
+/// The current implementation only accepts values in range `[0, 255]` - if the
+/// value is out-of-range the behavior is **undefined**.
+///
+/// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(bkpt, val = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(brk, val = 0))]
+#[inline(always)]
+#[rustc_args_required_const(0)]
+pub unsafe fn __breakpoint(val: i32) {
+    // Ensure that this compiles correctly on non-arm architectures, so libstd
+    // doc builds work. The proper macro will shadow this definition below.
+    #[allow(unused_macros)]
+    macro_rules! call {
+        ($e:expr) => {
+            ()
+        };
+    }
+
+    // The immediate is spliced into the assembly text with `stringify!`, so
+    // `call!` has to be invoked with a literal rather than a runtime value.
+    #[cfg(target_arch = "arm")]
+    macro_rules! call {
+        ($imm8:expr) => {
+            asm!(concat!("BKPT ", stringify!($imm8)) : : : : "volatile")
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    macro_rules! call {
+        ($imm8:expr) => {
+            asm!(concat!("BRK ", stringify!($imm8)) : : : : "volatile")
+        }
+    }
+
+    // We can't `panic!` inside this intrinsic, so we can't really validate the
+    // arguments here. If `val` is out-of-range this macro uses `val == 255`:
+    // NOTE(review): `constify_imm8!` is defined outside this file; presumably
+    // it dispatches `val` to `call!` with a matching literal - confirm.
+    constify_imm8!(val, call);
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/cmsis.rs b/library/stdarch/crates/core_arch/src/arm/cmsis.rs
new file mode 100644
index 00000000000..bc8509d3e8e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/cmsis.rs
@@ -0,0 +1,330 @@
+//! CMSIS: Cortex Microcontroller Software Interface Standard
+//!
+//! The version 5 of the standard can be found at:
+//!
+//! http://arm-software.github.io/CMSIS_5/Core/html/index.html
+//!
+//! The API reference of the standard can be found at:
+//!
+//! - Core function access -- http://arm-software.github.io/CMSIS_5/Core/html/group__Core__Register__gr.html
+//! - Intrinsic functions for CPU instructions -- http://arm-software.github.io/CMSIS_5/Core/html/group__intrinsic__CPU__gr.html
+//!
+//! The reference C implementation used as the base of this Rust port can be
+//! found at
+//!
+//! https://github.com/ARM-software/CMSIS_5/blob/5.3.0/CMSIS/Core/Include/cmsis_gcc.h
+
+#![allow(non_snake_case)]
+
+/* Core function access */
+
+/// Enable IRQ Interrupts
+///
+/// Enables IRQ interrupts by clearing the I-bit in the CPSR. Can only be
+/// executed in Privileged modes.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(cpsie))]
+pub unsafe fn __enable_irq() {
+    // No operands. The "memory" clobber and `volatile` flag are presumably
+    // there so memory accesses are not moved across the instruction and the
+    // instruction itself is never optimized away.
+    asm!("cpsie i" : : : "memory" : "volatile");
+}
+
+/// Disable IRQ Interrupts
+///
+/// Disables IRQ interrupts by setting the I-bit in the CPSR. Can only be
+/// executed in Privileged modes.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(cpsid))]
+pub unsafe fn __disable_irq() {
+    // No operands. The "memory" clobber and `volatile` flag are presumably
+    // there so memory accesses are not moved across the instruction and the
+    // instruction itself is never optimized away.
+    asm!("cpsid i" : : : "memory" : "volatile");
+}
+
+/// Get Control Register
+///
+/// Returns the content of the Control Register, read with an `mrs`
+/// instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_CONTROL() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, CONTROL" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Set Control Register
+///
+/// Writes the given value to the Control Register with an `msr` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(msr))]
+pub unsafe fn __set_CONTROL(control: u32) {
+    // `control` is passed in a general-purpose register (`r` input
+    // constraint); the write also declares a "memory" clobber.
+    asm!("msr CONTROL, $0" : : "r"(control) : "memory" : "volatile");
+}
+
+/// Get IPSR Register
+///
+/// Returns the content of the IPSR Register, read with an `mrs` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_IPSR() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, IPSR" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Get APSR Register
+///
+/// Returns the content of the APSR Register, read with an `mrs` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_APSR() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, APSR" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Get xPSR Register
+///
+/// Returns the content of the xPSR Register, read with an `mrs` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_xPSR() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, XPSR" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Get Process Stack Pointer
+///
+/// Returns the current value of the Process Stack Pointer (PSP), read with
+/// an `mrs` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_PSP() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, PSP" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Set Process Stack Pointer
+///
+/// Assigns the given value to the Process Stack Pointer (PSP) with an `msr`
+/// instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(msr))]
+pub unsafe fn __set_PSP(top_of_proc_stack: u32) {
+    // The new value is passed in a general-purpose register (`r` input
+    // constraint); no clobbers are declared for this write.
+    asm!("msr PSP, $0" : : "r"(top_of_proc_stack) : : "volatile");
+}
+
+/// Get Main Stack Pointer
+///
+/// Returns the current value of the Main Stack Pointer (MSP), read with an
+/// `mrs` instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_MSP() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    asm!("mrs $0, MSP" : "=r"(result) : : : "volatile");
+    result
+}
+
+/// Set Main Stack Pointer
+///
+/// Assigns the given value to the Main Stack Pointer (MSP) with an `msr`
+/// instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(msr))]
+pub unsafe fn __set_MSP(top_of_main_stack: u32) {
+    // The new value is passed in a general-purpose register (`r` input
+    // constraint); no clobbers are declared for this write.
+    asm!("msr MSP, $0" : : "r"(top_of_main_stack) : : "volatile");
+}
+
+/// Get Priority Mask
+///
+/// Returns the current state of the priority mask bit from the Priority Mask
+/// Register.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(mrs))]
+pub unsafe fn __get_PRIMASK() -> u32 {
+    let result: u32;
+    // `$0` is the register allocated for `result` (`=r` output constraint).
+    // Unlike the other register reads in this file, this one also declares a
+    // "memory" clobber.
+    asm!("mrs $0, PRIMASK" : "=r"(result) : : "memory" : "volatile");
+    result
+}
+
+/// Set Priority Mask
+///
+/// Assigns the given value to the Priority Mask Register with an `msr`
+/// instruction.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(msr))]
+pub unsafe fn __set_PRIMASK(pri_mask: u32) {
+    // NOTE(review): no "memory" clobber is declared here although
+    // `__get_PRIMASK` declares one - confirm against the referenced
+    // `cmsis_gcc.h` whether this write should carry it as well.
+    asm!("msr PRIMASK, $0" : : "r"(pri_mask) : : "volatile");
+}
+
+// Intrinsics that are only compiled when the `v7` target feature is enabled
+// (or when building documentation).
+#[cfg(any(target_feature = "v7", dox))]
+mod v7 {
+    /// Enable FIQ
+    ///
+    /// Enables FIQ interrupts by clearing the F-bit in the CPSR. Can only be
+    /// executed in Privileged modes.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(cpsie))]
+    pub unsafe fn __enable_fault_irq() {
+        asm!("cpsie f" : : : "memory" : "volatile");
+    }
+
+    /// Disable FIQ
+    ///
+    /// Disables FIQ interrupts by setting the F-bit in the CPSR. Can only be
+    /// executed in Privileged modes.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(cpsid))]
+    pub unsafe fn __disable_fault_irq() {
+        asm!("cpsid f" : : : "memory" : "volatile");
+    }
+
+    /// Get Base Priority
+    ///
+    /// Returns the current value of the Base Priority register.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(mrs))]
+    pub unsafe fn __get_BASEPRI() -> u32 {
+        let result: u32;
+        // `$0` is the register allocated for `result` (`=r` output constraint).
+        asm!("mrs $0, BASEPRI" : "=r"(result) : : : "volatile");
+        result
+    }
+
+    /// Set Base Priority
+    ///
+    /// Assigns the given value to the Base Priority register.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(msr))]
+    pub unsafe fn __set_BASEPRI(base_pri: u32) {
+        asm!("msr BASEPRI, $0" : : "r"(base_pri) : "memory" : "volatile");
+    }
+
+    /// Set Base Priority with condition
+    ///
+    /// Assigns the given value to the Base Priority register only if BASEPRI
+    /// masking is disabled, or the new value increases the BASEPRI
+    /// priority level.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    // This is a register write, so the emitted instruction is `msr`.
+    #[cfg_attr(test, assert_instr(msr))]
+    pub unsafe fn __set_BASEPRI_MAX(base_pri: u32) {
+        asm!("msr BASEPRI_MAX, $0" : : "r"(base_pri) : "memory" : "volatile");
+    }
+
+    /// Get Fault Mask
+    ///
+    /// Returns the current value of the Fault Mask register.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(mrs))]
+    pub unsafe fn __get_FAULTMASK() -> u32 {
+        let result: u32;
+        // `$0` is the register allocated for `result` (`=r` output constraint).
+        asm!("mrs $0, FAULTMASK" : "=r"(result) : : : "volatile");
+        result
+    }
+
+    /// Set Fault Mask
+    ///
+    /// Assigns the given value to the Fault Mask register.
+    #[inline]
+    #[target_feature(enable = "mclass")]
+    #[cfg_attr(test, assert_instr(msr))]
+    pub unsafe fn __set_FAULTMASK(fault_mask: u32) {
+        asm!("msr FAULTMASK, $0" : : "r"(fault_mask) : "memory" : "volatile");
+    }
+}
+
+#[cfg(any(target_feature = "v7", dox))]
+pub use self::v7::*;
+
+/* Core instruction access */
+
+/// No Operation
+///
+/// No Operation does nothing. This instruction can be used for code alignment
+/// purposes.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn __NOP() {
+    // No operands or clobbers; `volatile` is presumably what keeps the
+    // otherwise effect-free instruction from being removed.
+    asm!("nop" : : : : "volatile");
+}
+
+/// Wait For Interrupt
+///
+/// Wait For Interrupt is a hint instruction that suspends execution until one
+/// of a number of events occurs.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(wfi))]
+pub unsafe fn __WFI() {
+    // No operands or clobbers are declared for this hint.
+    asm!("wfi" : : : : "volatile");
+}
+
+/// Wait For Event
+///
+/// Wait For Event is a hint instruction that permits the processor to enter a
+/// low-power state until one of a number of events occurs.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(wfe))]
+pub unsafe fn __WFE() {
+    // No operands or clobbers are declared for this hint.
+    asm!("wfe" : : : : "volatile");
+}
+
+/// Send Event
+///
+/// Send Event is a hint instruction. It causes an event to be signaled to the
+/// CPU.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(sev))]
+pub unsafe fn __SEV() {
+    // No operands or clobbers are declared for this hint.
+    asm!("sev" : : : : "volatile");
+}
+
+/// Instruction Synchronization Barrier
+///
+/// Instruction Synchronization Barrier flushes the pipeline in the processor,
+/// so that all instructions following the ISB are fetched from cache or
+/// memory, after the instruction has been completed.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(isb))]
+pub unsafe fn __ISB() {
+    // NOTE(review): `0xF` is the barrier option operand; it should be the
+    // encoding of the full-system (`SY`) option - confirm against the
+    // architecture manual.
+    asm!("isb 0xF" : : : "memory" : "volatile");
+}
+
+/// Data Synchronization Barrier
+///
+/// Acts as a special kind of Data Memory Barrier. It completes when all
+/// explicit memory accesses before this instruction complete.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(dsb))]
+pub unsafe fn __DSB() {
+    // NOTE(review): `0xF` is the barrier option operand; it should be the
+    // encoding of the full-system (`SY`) option - confirm against the
+    // architecture manual.
+    asm!("dsb 0xF" : : : "memory" : "volatile");
+}
+
+/// Data Memory Barrier
+///
+/// Ensures the apparent order of the explicit memory operations before and
+/// after the instruction, without ensuring their completion.
+#[inline]
+#[target_feature(enable = "mclass")]
+#[cfg_attr(test, assert_instr(dmb))]
+pub unsafe fn __DMB() {
+    // NOTE(review): `0xF` is the barrier option operand; it should be the
+    // encoding of the full-system (`SY`) option - confirm against the
+    // architecture manual.
+    asm!("dmb 0xF" : : : "memory" : "volatile");
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/dsp.rs b/library/stdarch/crates/core_arch/src/arm/dsp.rs
new file mode 100644
index 00000000000..8385e7ed218
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/dsp.rs
@@ -0,0 +1,654 @@
+//! ARM DSP Intrinsics.
+//!
+//! Based on "Arm C Language Extensions (ACLE) Version Q2 2018"
+//!
+//! https://developer.arm.com/products/software-development-tools/compilers/arm-compiler-5/docs/101028/0006
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+// Packed 32-bit operand types taken and returned by the DSP intrinsics in
+// this file.
+types! {
+    /// ARM-specific 32-bit wide vector of four packed `i8`.
+    pub struct int8x4_t(i8, i8, i8, i8);
+    /// ARM-specific 32-bit wide vector of four packed `u8`.
+    pub struct uint8x4_t(u8, u8, u8, u8);
+    /// ARM-specific 32-bit wide vector of two packed `i16`.
+    pub struct int16x2_t(i16, i16);
+    /// ARM-specific 32-bit wide vector of two packed `u16`.
+    pub struct uint16x2_t(u16, u16);
+}
+
+// Calls the two-operand function `$name`, transmuting `$a` and `$b` to the
+// callee's parameter types and the result to whatever type the call site
+// expects. Must be expanded inside an `unsafe` context.
+macro_rules! dsp_call {
+    ($name:expr, $a:expr, $b:expr) => {
+        ::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
+    };
+}
+
+// LLVM intrinsics backing the public wrappers in this file. Packed operands
+// and results are declared as plain 32-bit integers here; the wrappers
+// transmute the packed vector types to and from them.
+extern "C" {
+    #[link_name = "llvm.arm.qadd"]
+    fn arm_qadd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qadd16"]
+    fn arm_qadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qadd8"]
+    fn arm_qadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qasx"]
+    fn arm_qasx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsax"]
+    fn arm_qsax(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub"]
+    fn arm_qsub(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub8"]
+    fn arm_qsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub16"]
+    fn arm_qsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd16"]
+    fn arm_sadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd8"]
+    fn arm_sadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sasx"]
+    fn arm_sasx(a: i32, b: i32) -> i32;
+
+    // Three-operand form: `c` is the 32-bit accumulator.
+    #[link_name = "llvm.arm.smlad"]
+    fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
+
+    // Three-operand form: `c` is the 32-bit accumulator.
+    #[link_name = "llvm.arm.smlsd"]
+    fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+
+    #[link_name = "llvm.arm.sel"]
+    fn arm_sel(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd8"]
+    fn arm_shadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd16"]
+    fn arm_shadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub8"]
+    fn arm_shsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub16"]
+    fn arm_shsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuad"]
+    fn arm_smuad(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuadx"]
+    fn arm_smuadx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusd"]
+    fn arm_smusd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusdx"]
+    fn arm_smusdx(a: i32, b: i32) -> i32;
+
+    // The only intrinsic here declared with an unsigned result.
+    #[link_name = "llvm.arm.usad8"]
+    fn arm_usad8(a: i32, b: i32) -> u32;
+}
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b: results outside
+/// the `i32` range are clamped to `i32::MAX` / `i32::MIN`.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn qadd(a: i32, b: i32) -> i32 {
+    arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b: results outside
+/// the `i32` range are clamped to `i32::MAX` / `i32::MIN`.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn qsub(a: i32, b: i32) -> i32 {
+    arm_qsub(a, b)
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qadd8, a, b)
+}
+
+/// Saturating four 8-bit integer subtractions
+///
+/// Returns the 8-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qsub8, a, b)
+}
+
+/// Saturating two 16-bit integer subtractions
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qsub16, a, b)
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qadd16, a, b)
+}
+
+/// Saturating 16-bit add and subtract with exchange
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qasx, a, b)
+}
+
+/// Saturating 16-bit subtract and add with exchange
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[1\]
+/// res\[1\] = a\[1\] - b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_qsax, a, b)
+}
+
+/// Two 16-bit signed integer additions
+///
+/// Returns the 16-bit signed (wrapping, not saturated) equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_sadd16, a, b)
+}
+
+/// Four 8-bit signed integer additions
+///
+/// Returns the 8-bit signed (wrapping, not saturated) equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_sadd8, a, b)
+}
+
+/// Dual 16-bit Signed Multiply with Addition of products
+/// and 32-bit accumulation.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlad))]
+pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    // Only the packed operands need reinterpreting; `c` is already an `i32`.
+    arm_smlad(::mem::transmute(a), ::mem::transmute(b), c)
+}
+
+/// Dual 16-bit Signed Multiply with Subtraction of products
+/// and 32-bit accumulation and overflow detection.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlsd))]
+pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    // Only the packed operands need reinterpreting; `c` is already an `i32`.
+    arm_smlsd(::mem::transmute(a), ::mem::transmute(b), c)
+}
+
+/// 16-bit signed add and subtract with exchange
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_sasx, a, b)
+}
+
+/// Select bytes from each operand according to APSR GE flags
+///
+/// Returns the equivalent of
+///
+/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
+/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
+/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
+/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
+///
+/// where GE are bits of APSR
+///
+/// This function is not compiled when the `mclass` target feature is enabled.
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+#[cfg(all(not(target_feature = "mclass")))]
+pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_sel, a, b)
+}
+
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_shadd8, a, b)
+}
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_shsub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    // Operands and result are reinterpreted as the intrinsic's `i32` values.
+    dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the 32-bit equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+    // The packed operands are reinterpreted as the intrinsic's `i32` values.
+    arm_smuad(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the 32-bit equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+    // The packed operands are reinterpreted as the intrinsic's `i32` values.
+    arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the 32-bit equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+// NOTE(review): this operation subtracts the products, so the Q-flag sentence
+// above looks copied from `smuad` - confirm against the architecture manual.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+    // The packed operands are reinterpreted as the intrinsic's `i32` values.
+    arm_smusd(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the 32-bit equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+// NOTE(review): this operation subtracts the products, so the Q-flag sentence
+// above looks copied from `smuadx` - confirm against the architecture manual.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+    // The packed operands are reinterpreted as the intrinsic's `i32` values.
+    arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Sum of 8-bit absolute differences.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\])
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+    // The packed operands are reinterpreted as the intrinsic's `i32` values.
+    arm_usad8(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Sum of 8-bit absolute differences and constant.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\]) + c
+///
+/// The accumulation wraps around on 32-bit overflow.
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn usad8a(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+    // A plain `+` would panic on overflow when overflow checks are enabled;
+    // a 32-bit accumulate is expected to wrap instead.
+    usad8(a, b).wrapping_add(c)
+}
+
+#[cfg(test)]
+mod tests {
+    // Each test shares its name with the intrinsic it checks, hence the
+    // `dsp::` qualification on every call. Inputs are built with the portable
+    // `i8x4` / `i16x2` types and transmuted to the intrinsics' operand types.
+    use core_arch::arm::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[test]
+    fn qadd() {
+        unsafe {
+            assert_eq!(dsp::qadd(-10, 60), 50);
+            // Both directions must clamp instead of wrapping.
+            assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+            assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qsub() {
+        unsafe {
+            assert_eq!(dsp::qsub(10, 60), -50);
+            // Both directions must clamp instead of wrapping.
+            assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+            assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qadd8() {
+        unsafe {
+            // The last lane (`MAX + 1`) saturates at `i8::MAX`.
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub8() {
+        unsafe {
+            // The last lane (`MIN - 1`) saturates at `i8::MIN`.
+            let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
+            let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, -1);
+            let c = i16x2::new(3, 1);
+            let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub16() {
+        unsafe {
+            let a = i16x2::new(10, 20);
+            let b = i16x2::new(20, -10);
+            let c = i16x2::new(-10, 30);
+            let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qasx() {
+        unsafe {
+            // Lane 0: 1 - 2 = -1; lane 1: MAX + 2 saturates at `i16::MAX`.
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(-1, ::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::qasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsax() {
+        unsafe {
+            // Lane 0: 1 + 2 = 3; lane 1: MAX - 2.
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, ::std::i16::MAX - 2);
+            let r: i16x2 = dsp_call!(dsp::qsax, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd16() {
+        unsafe {
+            // Not saturating: `MAX + 2` wraps around to `-MAX` (= `MIN + 1`).
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, -::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd8() {
+        unsafe {
+            // Not saturating: `MAX + 2` wraps around to `-MAX` (= `MIN + 1`).
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sasx() {
+        unsafe {
+            // Lane 0: 1 - 1 = 0; lane 1: 2 + 2 = 4.
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, 1);
+            let c = i16x2::new(0, 4);
+            let r: i16x2 = dsp_call!(dsp::sasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smlad() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = dsp::smlad(::mem::transmute(a), ::mem::transmute(b), 10);
+            assert_eq!(r, (1 * 3) + (2 * 4) + 10);
+        }
+    }
+
+    #[test]
+    fn smlsd() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = dsp::smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
+            assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
+        }
+    }
+
+    #[test]
+    fn sel() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            // call sadd8() to set GE bits
+            // NOTE(review): the expected value equals `a`, i.e. every GE bit
+            // is assumed to be set by the preceding addition - this relies on
+            // the flags surviving between the two calls.
+            dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
+            let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sel, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd8() {
+        unsafe {
+            // Every lane sums to 6, halved to 3.
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(3, 3, 3, 3);
+            let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd16() {
+        unsafe {
+            // Every lane sums to 6, halved to 3.
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(3, 3);
+            let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub8() {
+        unsafe {
+            // Differences are -4, -2, 0, 2 before halving.
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(-2, -1, 0, 1);
+            let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub16() {
+        unsafe {
+            // Differences are -4 and -2 before halving.
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(-2, -1);
+            let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smuad() {
+        unsafe {
+            // 1 * 5 + 2 * 4
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 13);
+        }
+    }
+
+    #[test]
+    fn smuadx() {
+        unsafe {
+            // 1 * 4 + 2 * 5
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 14);
+        }
+    }
+
+    #[test]
+    fn smusd() {
+        unsafe {
+            // 1 * 5 - 2 * 4
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -3);
+        }
+    }
+
+    #[test]
+    fn smusdx() {
+        unsafe {
+            // 1 * 4 - 2 * 5
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -6);
+        }
+    }
+
+    #[test]
+    fn usad8() {
+        unsafe {
+            // |1-4| + |2-3| + |3-2| + |4-1| = 3 + 1 + 1 + 3
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(4, 3, 2, 1);
+            let r = dsp::usad8(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 8);
+        }
+    }
+
+    #[test]
+    fn usad8a() {
+        unsafe {
+            // Same lanes as the `usad8` test, plus the accumulator `c`.
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(4, 3, 2, 1);
+            let c = 10;
+            let r = dsp::usad8a(::mem::transmute(a), ::mem::transmute(b), c);
+            assert_eq!(r, 8 + c);
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs
new file mode 100644
index 00000000000..30ff991f8d9
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/mod.rs
@@ -0,0 +1,56 @@
+//! ARM intrinsics.
+//!
+//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref].
+//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
+//!
+//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
+//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
+#![allow(non_camel_case_types)]
+
+// `armclang` and `v6` are declared and re-exported without any cfg gate.
+mod armclang;
+
+pub use self::armclang::*;
+
+// `cmsis` is only built with the `mclass` target feature, or under the `dox`
+// cfg (presumably set for documentation builds — confirm).
+#[cfg(any(target_feature = "mclass", dox))]
+mod cmsis;
+#[cfg(any(target_feature = "mclass", dox))]
+pub use self::cmsis::*;
+
+mod v6;
+pub use self::v6::*;
+
+// `v7` is built on AArch64 and on ARM with the `v7` target feature. Unlike
+// the other gated modules, its cfg has no `dox` arm.
+#[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
+mod v7;
+#[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
+pub use self::v7::*;
+
+// `dsp` requires `v7` without `mclass`, or the `dox` cfg.
+#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
+mod dsp;
+#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
+pub use self::dsp::*;
+
+// NEON is supported on AArch64, and on ARM when built with the v7 and neon
+// features. Building ARM without neon produces incorrect codegen.
+#[cfg(any(
+    target_arch = "aarch64",
+    all(target_feature = "v7", target_feature = "neon"),
+    dox
+))]
+mod neon;
+#[cfg(any(
+    target_arch = "aarch64",
+    all(target_feature = "v7", target_feature = "neon"),
+    dox
+))]
+pub use self::neon::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Generates the trap instruction `UDF` by calling the `abort` intrinsic; never returns.
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(udf))]
+#[inline]
+pub unsafe fn udf() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs
new file mode 100644
index 00000000000..799f2a14ee3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/neon.rs
@@ -0,0 +1,1420 @@
+//! ARMv7 NEON intrinsics
+
+use core_arch::simd_llvm::*;
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+types! {
+    /// ARM-specific 64-bit wide vector of eight packed `i8`.
+    pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8);
+    /// ARM-specific 64-bit wide vector of eight packed `u8`.
+    pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8);
+    /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`.
+    pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8);
+    /// ARM-specific 64-bit wide vector of four packed `i16`.
+    pub struct int16x4_t(i16, i16, i16, i16);
+    /// ARM-specific 64-bit wide vector of four packed `u16`.
+    pub struct uint16x4_t(u16, u16, u16, u16);
+    // FIXME: ARM-specific 64-bit wide vector of four packed `f16`.
+    // pub struct float16x4_t(f16, f16, f16, f16);
+    /// ARM-specific 64-bit wide polynomial vector of four packed `u16`.
+    pub struct poly16x4_t(u16, u16, u16, u16);
+    /// ARM-specific 64-bit wide vector of two packed `i32`.
+    pub struct int32x2_t(i32, i32);
+    /// ARM-specific 64-bit wide vector of two packed `u32`.
+    pub struct uint32x2_t(u32, u32);
+    /// ARM-specific 64-bit wide vector of two packed `f32`.
+    pub struct float32x2_t(f32, f32);
+    /// ARM-specific 64-bit wide vector of one packed `i64`.
+    pub struct int64x1_t(i64);
+    /// ARM-specific 64-bit wide vector of one packed `u64`.
+    pub struct uint64x1_t(u64);
+
+    /// ARM-specific 128-bit wide vector of sixteen packed `i8`.
+    pub struct int8x16_t(
+        i8, i8 ,i8, i8, i8, i8 ,i8, i8,
+        i8, i8 ,i8, i8, i8, i8 ,i8, i8,
+    );
+    /// ARM-specific 128-bit wide vector of sixteen packed `u8`.
+    pub struct uint8x16_t(
+        u8, u8 ,u8, u8, u8, u8 ,u8, u8,
+        u8, u8 ,u8, u8, u8, u8 ,u8, u8,
+    );
+    /// ARM-specific 128-bit wide polynomial vector of sixteen packed `u8`.
+    pub struct poly8x16_t(
+        u8, u8, u8, u8, u8, u8, u8, u8,
+        u8, u8, u8, u8, u8, u8, u8, u8
+    );
+    /// ARM-specific 128-bit wide vector of eight packed `i16`.
+    pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16);
+    /// ARM-specific 128-bit wide vector of eight packed `u16`.
+    pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16);
+    // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`.
+    // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16, f16);
+    /// ARM-specific 128-bit wide polynomial vector of eight packed `u16`.
+    pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16);
+    /// ARM-specific 128-bit wide vector of four packed `i32`.
+    pub struct int32x4_t(i32, i32, i32, i32);
+    /// ARM-specific 128-bit wide vector of four packed `u32`.
+    pub struct uint32x4_t(u32, u32, u32, u32);
+    /// ARM-specific 128-bit wide vector of four packed `f32`.
+    pub struct float32x4_t(f32, f32, f32, f32);
+    /// ARM-specific 128-bit wide vector of two packed `i64`.
+    pub struct int64x2_t(i64, i64);
+    /// ARM-specific 128-bit wide vector of two packed `u64`.
+    pub struct uint64x2_t(u64, u64);
+}
+
+/// ARM-specific type containing two `int8x8_t` vectors, exposed as the public fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct int8x8x2_t(pub int8x8_t, pub int8x8_t);
+/// ARM-specific type containing three `int8x8_t` vectors, exposed as the public fields `.0` to `.2`.
+#[derive(Copy, Clone)]
+pub struct int8x8x3_t(pub int8x8_t, pub int8x8_t, pub int8x8_t);
+/// ARM-specific type containing four `int8x8_t` vectors, exposed as the public fields `.0` to `.3`.
+#[derive(Copy, Clone)]
+pub struct int8x8x4_t(pub int8x8_t, pub int8x8_t, pub int8x8_t, pub int8x8_t);
+
+/// ARM-specific type containing two `uint8x8_t` vectors, exposed as the public fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct uint8x8x2_t(pub uint8x8_t, pub uint8x8_t);
+/// ARM-specific type containing three `uint8x8_t` vectors, exposed as the public fields `.0` to `.2`.
+#[derive(Copy, Clone)]
+pub struct uint8x8x3_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
+/// ARM-specific type containing four `uint8x8_t` vectors, exposed as the public fields `.0` to `.3`.
+#[derive(Copy, Clone)]
+pub struct uint8x8x4_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
+
+/// ARM-specific type containing two `poly8x8_t` vectors, exposed as the public fields `.0` and `.1`.
+#[derive(Copy, Clone)]
+pub struct poly8x8x2_t(pub poly8x8_t, pub poly8x8_t);
+/// ARM-specific type containing three `poly8x8_t` vectors, exposed as the public fields `.0` to `.2`.
+#[derive(Copy, Clone)]
+pub struct poly8x8x3_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
+/// ARM-specific type containing four `poly8x8_t` vectors, exposed as the public fields `.0` to `.3`.
+#[derive(Copy, Clone)]
+pub struct poly8x8x4_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
+
+// LLVM NEON intrinsics declared for both ARM and AArch64. Each declaration
+// selects its `link_name` from the target architecture via `cfg_attr`.
+#[allow(improper_ctypes)]
+extern "C" {
+    // Reciprocal square-root estimate; the Rust-side name follows the AArch64
+    // spelling (`frsqrte`) on both architectures.
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
+    fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;
+
+    // Pairwise minimum. On ARM the `f32` variant is bound to the `vpmins`
+    // name with a `.v2f32` suffix; on AArch64 it is `fminp`.
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")]
+    fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")]
+    fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")]
+    fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")]
+    fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")]
+    fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")]
+    fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")]
+    fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+
+    // Pairwise maximum, with the same naming scheme as the minimum family.
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")]
+    fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")]
+    fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")]
+    fn vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")]
+    fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")]
+    fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")]
+    fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")]
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")]
+    fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+}
+
+// LLVM table look-up intrinsics, declared only when targeting 32-bit ARM.
+// Parameter names are purely positional (`a`, `b`, `c`, ...); every argument
+// and every return value is an `int8x8_t`.
+#[cfg(target_arch = "arm")]
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.arm.neon.vtbl1"]
+    fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbl2"]
+    fn vtbl2(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbl3"]
+    fn vtbl3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbl4"]
+    fn vtbl4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+
+    #[link_name = "llvm.arm.neon.vtbx1"]
+    fn vtbx1(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx2"]
+    fn vtbx2(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx3"]
+    fn vtbx3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx4"]
+    fn vtbx4(
+        a: int8x8_t,
+        b: int8x8_t,
+        c: int8x8_t,
+        d: int8x8_t,
+        e: int8x8_t,
+        f: int8x8_t,
+    ) -> int8x8_t;
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int8x8_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int8x16_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int16x4_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int16x8_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int32x2_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int32x4_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `int64x2_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint8x8_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint8x16_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint16x4_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint16x8_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint32x2_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint32x4_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `uint64x2_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+pub unsafe fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `float32x2_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))]
+pub unsafe fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    simd_add(a, b)
+}
+
+/// Vector add: returns `simd_add(a, b)` for two `float32x4_t` vectors.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))]
+pub unsafe fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    simd_add(a, b)
+}
+
+/// Vector long add: casts both `int8x8_t` operands to `int16x8_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+pub unsafe fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
+    let lhs: int16x8_t = simd_cast(a);
+    let rhs: int16x8_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector long add: casts both `int16x4_t` operands to `int32x4_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+pub unsafe fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+    let lhs: int32x4_t = simd_cast(a);
+    let rhs: int32x4_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector long add: casts both `int32x2_t` operands to `int64x2_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+pub unsafe fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+    let lhs: int64x2_t = simd_cast(a);
+    let rhs: int64x2_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector long add: casts both `uint8x8_t` operands to `uint16x8_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+pub unsafe fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
+    let lhs: uint16x8_t = simd_cast(a);
+    let rhs: uint16x8_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector long add: casts both `uint16x4_t` operands to `uint32x4_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+pub unsafe fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+    let lhs: uint32x4_t = simd_cast(a);
+    let rhs: uint32x4_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector long add: casts both `uint32x2_t` operands to `uint64x2_t`, then adds them.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+    let lhs: uint64x2_t = simd_cast(a);
+    let rhs: uint64x2_t = simd_cast(b);
+    simd_add(lhs, rhs)
+}
+
+/// Vector narrow integer: `simd_cast`s the `int16x8_t` input to `int8x8_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t {
+    simd_cast(a)
+}
+
+/// Vector narrow integer: `simd_cast`s the `int32x4_t` input to `int16x4_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t {
+    simd_cast(a)
+}
+
+/// Vector narrow integer: `simd_cast`s the `int64x2_t` input to `int32x2_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t {
+    simd_cast(a)
+}
+
+/// Vector narrow integer: `simd_cast`s the `uint16x8_t` input to `uint8x8_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t {
+    simd_cast(a)
+}
+
+/// Vector narrow integer: `simd_cast`s the `uint32x4_t` input to `uint16x4_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t {
+    simd_cast(a)
+}
+
+/// Vector narrow integer: `simd_cast`s the `uint64x2_t` input to `uint32x2_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `int8x8_t` input to `int16x8_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `int16x4_t` input to `int32x4_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `int32x2_t` input to `int64x2_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `uint8x8_t` input to `uint16x8_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `uint16x4_t` input to `uint32x4_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t {
+    simd_cast(a)
+}
+
+/// Vector long move: `simd_cast`s the `uint32x2_t` input to `uint64x2_t`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t {
+    simd_cast(a)
+}
+
+/// Reciprocal square-root estimate.
+///
+/// Forwards `a` to the LLVM `vrsqrte.v2f32` (ARM) / `frsqrte.v2f32` (AArch64)
+/// intrinsic and returns its result.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+    frsqrte_v2f32(a)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpmins.v8i8` (ARM) / `sminp.v8i8` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+pub unsafe fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    vpmins_v8i8(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpmins.v4i16` (ARM) / `sminp.v4i16` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+pub unsafe fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    vpmins_v4i16(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpmins.v2i32` (ARM) / `sminp.v2i32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+pub unsafe fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    vpmins_v2i32(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpminu.v8i8` (ARM) / `uminp.v8i8` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+pub unsafe fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    vpminu_v8i8(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpminu.v4i16` (ARM) / `uminp.v4i16` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+pub unsafe fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    vpminu_v4i16(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpminu.v2i32` (ARM) / `uminp.v2i32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+pub unsafe fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    vpminu_v2i32(a, b)
+}
+
+/// Folding minimum of adjacent pairs, via LLVM's `vpmins.v2f32` (ARM) / `fminp.v2f32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminp))]
+pub unsafe fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    vpminf_v2f32(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxs.v8i8` (ARM) / `smaxp.v8i8` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+pub unsafe fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    vpmaxs_v8i8(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxs.v4i16` (ARM) / `smaxp.v4i16` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+pub unsafe fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    vpmaxs_v4i16(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxs.v2i32` (ARM) / `smaxp.v2i32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+pub unsafe fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    vpmaxs_v2i32(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxu.v8i8` (ARM) / `umaxp.v8i8` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+pub unsafe fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    vpmaxu_v8i8(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxu.v4i16` (ARM) / `umaxp.v4i16` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+pub unsafe fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    vpmaxu_v4i16(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxu.v2i32` (ARM) / `umaxp.v2i32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+pub unsafe fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    vpmaxu_v2i32(a, b)
+}
+
+/// Folding maximum of adjacent pairs, via LLVM's `vpmaxs.v2f32` (ARM) / `fmaxp.v2f32` (AArch64).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxp))]
+pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    vpmaxf_v2f32(a, b)
+}
+
+/// Table look-up: forwards `a` and `b` unchanged to the `vtbl1` intrinsic.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    vtbl1(a, b)
+}
+
+/// Table look-up: transmutes `a` and `b` for the `vtbl1` intrinsic and its result to `uint8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbl1(::mem::transmute(a), ::mem::transmute(b)))
+}
+
+/// Table look-up: transmutes `a` and `b` for the `vtbl1` intrinsic and its result to `poly8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbl1(::mem::transmute(a), ::mem::transmute(b)))
+}
+
+/// Table look-up: passes both vectors of `a`, then `b`, to the `vtbl2` intrinsic.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t {
+    vtbl2(a.0, a.1, b)
+}
+
+/// Table look-up: transmutes `a.0`, `a.1` and `b` for `vtbl2` and its result to `uint8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbl2(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Table look-up: transmutes `a.0`, `a.1` and `b` for `vtbl2` and its result to `poly8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbl2(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Table look-up: passes the three vectors of `a`, then `b`, to the `vtbl3` intrinsic.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t {
+    vtbl3(a.0, a.1, a.2, b)
+}
+
+/// Table look-up: transmutes `a.0` to `a.2` and `b` for `vtbl3` and its result to `uint8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbl3(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(a.2),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Table look-up: transmutes `a.0` to `a.2` and `b` for `vtbl3` and its result to `poly8x8_t`.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbl3(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(a.2),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Table look-up in the 4-vector table `a`: lane `i` is entry `b[i]`, or 0 when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t {
+    vtbl4(a.0, a.1, a.2, a.3, b)
+}
+
+/// Table look-up (unsigned) in the 4-vector table `a`: entry `b[i]`, or 0 when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbl4(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(a.2),
+        ::mem::transmute(a.3),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Table look-up (polynomial) in the 4-vector table `a`: entry `b[i]`, or 0 when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbl4(
+        ::mem::transmute(a.0),
+        ::mem::transmute(a.1),
+        ::mem::transmute(a.2),
+        ::mem::transmute(a.3),
+        ::mem::transmute(b),
+    ))
+}
+
+/// Extended table look-up in table `b`: lane `i` is entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+    vtbx1(a, b, c)
+}
+
+/// Extended look-up (unsigned) in table `b`: entry `c[i]`, or lane `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbx1(
+        ::mem::transmute(a),
+        ::mem::transmute(b),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up (polynomial) in table `b`: entry `c[i]`, or lane `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbx1(
+        ::mem::transmute(a),
+        ::mem::transmute(b),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up in the 2-vector table `b`: entry `c[i]`, or lane `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t {
+    vtbx2(a, b.0, b.1, c)
+}
+
+/// Extended look-up (unsigned), 2-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbx2(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up (polynomial), 2-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbx2(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up in the 3-vector table `b`: entry `c[i]`, or lane `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t {
+    vtbx3(a, b.0, b.1, b.2, c)
+}
+
+/// Extended look-up (unsigned), 3-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbx3(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(b.2),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up (polynomial), 3-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbx3(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(b.2),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up in the 4-vector table `b`: entry `c[i]`, or lane `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t {
+    vtbx4(a, b.0, b.1, b.2, b.3, c)
+}
+
+/// Extended look-up (unsigned), 4-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t {
+    ::mem::transmute(vtbx4(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(b.2),
+        ::mem::transmute(b.3),
+        ::mem::transmute(c),
+    ))
+}
+
+/// Extended look-up (polynomial), 4-vector table `b`: entry `c[i]`, or `a[i]` when out of range.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t {
+    ::mem::transmute(vtbx4(
+        ::mem::transmute(a),
+        ::mem::transmute(b.0),
+        ::mem::transmute(b.1),
+        ::mem::transmute(b.2),
+        ::mem::transmute(b.3),
+        ::mem::transmute(c),
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arm::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_s8() {
+        let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = i8x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+        let e = i8x8::new(9, 9, 9, 9, 9, 9, 9, 9);
+        let r: i8x8 = ::mem::transmute(vadd_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_s8() {
+        let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
+        let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
+        let r: i8x16 = ::mem::transmute(vaddq_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_s16() {
+        let a = i16x4::new(1, 2, 3, 4);
+        let b = i16x4::new(8, 7, 6, 5);
+        let e = i16x4::new(9, 9, 9, 9);
+        let r: i16x4 = ::mem::transmute(vadd_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_s16() {
+        let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+        let e = i16x8::new(9, 9, 9, 9, 9, 9, 9, 9);
+        let r: i16x8 = ::mem::transmute(vaddq_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_s32() {
+        let a = i32x2::new(1, 2);
+        let b = i32x2::new(8, 7);
+        let e = i32x2::new(9, 9);
+        let r: i32x2 = ::mem::transmute(vadd_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_s32() {
+        let a = i32x4::new(1, 2, 3, 4);
+        let b = i32x4::new(8, 7, 6, 5);
+        let e = i32x4::new(9, 9, 9, 9);
+        let r: i32x4 = ::mem::transmute(vaddq_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_u8() {
+        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u8x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+        let e = u8x8::new(9, 9, 9, 9, 9, 9, 9, 9);
+        let r: u8x8 = ::mem::transmute(vadd_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_u8() {
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
+        let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
+        let r: u8x16 = ::mem::transmute(vaddq_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_u16() {
+        let a = u16x4::new(1, 2, 3, 4);
+        let b = u16x4::new(8, 7, 6, 5);
+        let e = u16x4::new(9, 9, 9, 9);
+        let r: u16x4 = ::mem::transmute(vadd_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_u16() {
+        let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+        let e = u16x8::new(9, 9, 9, 9, 9, 9, 9, 9);
+        let r: u16x8 = ::mem::transmute(vaddq_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_u32() {
+        let a = u32x2::new(1, 2);
+        let b = u32x2::new(8, 7);
+        let e = u32x2::new(9, 9);
+        let r: u32x2 = ::mem::transmute(vadd_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_u32() {
+        let a = u32x4::new(1, 2, 3, 4);
+        let b = u32x4::new(8, 7, 6, 5);
+        let e = u32x4::new(9, 9, 9, 9);
+        let r: u32x4 = ::mem::transmute(vaddq_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vadd_f32() {
+        let a = f32x2::new(1., 2.);
+        let b = f32x2::new(8., 7.);
+        let e = f32x2::new(9., 9.);
+        let r: f32x2 = ::mem::transmute(vadd_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddq_f32() {
+        let a = f32x4::new(1., 2., 3., 4.);
+        let b = f32x4::new(8., 7., 6., 5.);
+        let e = f32x4::new(9., 9., 9., 9.);
+        let r: f32x4 = ::mem::transmute(vaddq_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_s8() {
+        let v = ::std::i8::MAX;
+        let a = i8x8::new(v, v, v, v, v, v, v, v);
+        let v = 2 * (v as i16);
+        let e = i16x8::new(v, v, v, v, v, v, v, v);
+        let r: i16x8 = ::mem::transmute(vaddl_s8(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_s16() {
+        let v = ::std::i16::MAX;
+        let a = i16x4::new(v, v, v, v);
+        let v = 2 * (v as i32);
+        let e = i32x4::new(v, v, v, v);
+        let r: i32x4 = ::mem::transmute(vaddl_s16(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_s32() {
+        let v = ::std::i32::MAX;
+        let a = i32x2::new(v, v);
+        let v = 2 * (v as i64);
+        let e = i64x2::new(v, v);
+        let r: i64x2 = ::mem::transmute(vaddl_s32(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_u8() {
+        let v = ::std::u8::MAX;
+        let a = u8x8::new(v, v, v, v, v, v, v, v);
+        let v = 2 * (v as u16);
+        let e = u16x8::new(v, v, v, v, v, v, v, v);
+        let r: u16x8 = ::mem::transmute(vaddl_u8(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_u16() {
+        let v = ::std::u16::MAX;
+        let a = u16x4::new(v, v, v, v);
+        let v = 2 * (v as u32);
+        let e = u32x4::new(v, v, v, v);
+        let r: u32x4 = ::mem::transmute(vaddl_u16(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vaddl_u32() {
+        let v = ::std::u32::MAX;
+        let a = u32x2::new(v, v);
+        let v = 2 * (v as u64);
+        let e = u64x2::new(v, v);
+        let r: u64x2 = ::mem::transmute(vaddl_u32(::mem::transmute(a), ::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_s16() {
+        let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r: i8x8 = ::mem::transmute(vmovn_s16(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_s32() {
+        let a = i32x4::new(1, 2, 3, 4);
+        let e = i16x4::new(1, 2, 3, 4);
+        let r: i16x4 = ::mem::transmute(vmovn_s32(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_s64() {
+        let a = i64x2::new(1, 2);
+        let e = i32x2::new(1, 2);
+        let r: i32x2 = ::mem::transmute(vmovn_s64(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_u16() {
+        let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r: u8x8 = ::mem::transmute(vmovn_u16(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_u32() {
+        let a = u32x4::new(1, 2, 3, 4);
+        let e = u16x4::new(1, 2, 3, 4);
+        let r: u16x4 = ::mem::transmute(vmovn_u32(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovn_u64() {
+        let a = u64x2::new(1, 2);
+        let e = u32x2::new(1, 2);
+        let r: u32x2 = ::mem::transmute(vmovn_u64(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_s8() {
+        let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r: i16x8 = ::mem::transmute(vmovl_s8(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_s16() {
+        let e = i32x4::new(1, 2, 3, 4);
+        let a = i16x4::new(1, 2, 3, 4);
+        let r: i32x4 = ::mem::transmute(vmovl_s16(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_s32() {
+        let e = i64x2::new(1, 2);
+        let a = i32x2::new(1, 2);
+        let r: i64x2 = ::mem::transmute(vmovl_s32(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_u8() {
+        let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r: u16x8 = ::mem::transmute(vmovl_u8(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_u16() {
+        let e = u32x4::new(1, 2, 3, 4);
+        let a = u16x4::new(1, 2, 3, 4);
+        let r: u32x4 = ::mem::transmute(vmovl_u16(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmovl_u32() {
+        let e = u64x2::new(1, 2);
+        let a = u32x2::new(1, 2);
+        let r: u64x2 = ::mem::transmute(vmovl_u32(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrt_f32() {
+        let a = f32x2::new(1.0, 2.0);
+        let e = f32x2::new(0.9980469, 0.7050781);
+        let r: f32x2 = ::mem::transmute(vrsqrte_f32(::mem::transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_s8() {
+        let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
+        let b = i8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = i8x8::new(-2, -4, 5, 7, 0, 2, 4, 6);
+        let r: i8x8 = ::mem::transmute(vpmin_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_s16() {
+        let a = i16x4::new(1, 2, 3, -4);
+        let b = i16x4::new(0, 3, 2, 5);
+        let e = i16x4::new(1, -4, 0, 2);
+        let r: i16x4 = ::mem::transmute(vpmin_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_s32() {
+        let a = i32x2::new(1, -2);
+        let b = i32x2::new(0, 3);
+        let e = i32x2::new(-2, 0);
+        let r: i32x2 = ::mem::transmute(vpmin_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_u8() {
+        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = u8x8::new(1, 3, 5, 7, 0, 2, 4, 6);
+        let r: u8x8 = ::mem::transmute(vpmin_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_u16() {
+        let a = u16x4::new(1, 2, 3, 4);
+        let b = u16x4::new(0, 3, 2, 5);
+        let e = u16x4::new(1, 3, 0, 2);
+        let r: u16x4 = ::mem::transmute(vpmin_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_u32() {
+        let a = u32x2::new(1, 2);
+        let b = u32x2::new(0, 3);
+        let e = u32x2::new(1, 0);
+        let r: u32x2 = ::mem::transmute(vpmin_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmin_f32() {
+        let a = f32x2::new(1., -2.);
+        let b = f32x2::new(0., 3.);
+        let e = f32x2::new(-2., 0.);
+        let r: f32x2 = ::mem::transmute(vpmin_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_s8() {
+        let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
+        let b = i8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = i8x8::new(1, 3, 6, 8, 3, 5, 7, 9);
+        let r: i8x8 = ::mem::transmute(vpmax_s8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_s16() {
+        let a = i16x4::new(1, 2, 3, -4);
+        let b = i16x4::new(0, 3, 2, 5);
+        let e = i16x4::new(2, 3, 3, 5);
+        let r: i16x4 = ::mem::transmute(vpmax_s16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_s32() {
+        let a = i32x2::new(1, -2);
+        let b = i32x2::new(0, 3);
+        let e = i32x2::new(1, 3);
+        let r: i32x2 = ::mem::transmute(vpmax_s32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_u8() {
+        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = u8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+        let e = u8x8::new(2, 4, 6, 8, 3, 5, 7, 9);
+        let r: u8x8 = ::mem::transmute(vpmax_u8(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_u16() {
+        let a = u16x4::new(1, 2, 3, 4);
+        let b = u16x4::new(0, 3, 2, 5);
+        let e = u16x4::new(2, 4, 3, 5);
+        let r: u16x4 = ::mem::transmute(vpmax_u16(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_u32() {
+        let a = u32x2::new(1, 2);
+        let b = u32x2::new(0, 3);
+        let e = u32x2::new(2, 3);
+        let r: u32x2 = ::mem::transmute(vpmax_u32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vpmax_f32() {
+        let a = f32x2::new(1., -2.);
+        let b = f32x2::new(0., 3.);
+        let e = f32x2::new(1., 3.);
+        let r: f32x2 = ::mem::transmute(vpmax_f32(::mem::transmute(a), ::mem::transmute(b)));
+        assert_eq!(r, e);
+    }
+}
+
+#[cfg(test)]
+#[cfg(target_endian = "little")]
+#[path = "table_lookup_tests.rs"]
+mod table_lookup_tests;
diff --git a/library/stdarch/crates/core_arch/src/arm/table_lookup_tests.rs b/library/stdarch/crates/core_arch/src/arm/table_lookup_tests.rs
new file mode 100644
index 00000000000..4d0c21ee01c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/table_lookup_tests.rs
@@ -0,0 +1,1042 @@
+//! Tests for ARM+v7+neon table lookup (vtbl, vtbx) intrinsics.
+//!
+//! These are included in `{arm, aarch64}::neon`.
+
+use super::*;
+
+#[cfg(target_arch = "aarch64")]
+use core_arch::aarch64::*;
+
+#[cfg(target_arch = "arm")]
+use core_arch::arm::*;
+
+use core_arch::simd::*;
+use std::mem;
+use stdsimd_test::simd_test;
+
+macro_rules! test_vtbl {
+    ($test_name:ident => $fn_id:ident:
+     - table[$table_t:ident]: [$($table_v:expr),*] |
+     $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
+    ) => {
+        #[simd_test(enable = "neon")]
+        unsafe fn $test_name() {
+            // Build the table from a plain array literal and bit-cast it
+            // to the table type named by `$table_t`.
+            let table: $table_t = ::mem::transmute([$($table_v),*]);
+
+            // For each `ctrl => expected` pair, call `$fn_id` on the table
+            // and assert that the result equals `expected`:
+            $(
+                {
+                    let ctrl: $ctrl_t = ::mem::transmute([$($ctrl_v),*]);
+                    let result = $fn_id(table, ::mem::transmute(ctrl));
+                    let result: $ctrl_t = ::mem::transmute(result);
+                    let expected: $ctrl_t = ::mem::transmute([$($exp_v),*]);
+                    assert_eq!(result, expected);
+                }
+            )*
+        }
+    }
+}
+
+// ARM+v7+neon and AArch64+neon tests
+
+test_vtbl!(
+    test_vtbl1_s8 => vtbl1_s8: // 8-entry table: indices outside 0..8 yield 0
+    - table[int8x8_t]: [0_i8, -11, 2, 3, 4, 5, 6, 7] |
+    - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [3_i8, 4, -11, 6, 0, 2, 7, 5] |
+    - ctrl[i8x8]: [3_i8, 8, 1, -9, 10, 2, 15, 5] => [3_i8, 0, -11, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+    test_vtbl1_u8 => vtbl1_u8: // 8-entry table: indices outside 0..8 yield 0
+    - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+    test_vtbl1_p8 => vtbl1_p8: // 8-entry table: indices outside 0..8 yield 0
+    - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+    test_vtbl2_s8 => vtbl2_s8: // 16-entry table: indices outside 0..16 yield 0
+    - table[int8x8x2_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121
+    ] |
+    - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, -17, -72, 34, -116, 51, -104] |
+    - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+    test_vtbl2_u8 => vtbl2_u8: // 16-entry table: indices outside 0..16 yield 0
+    - table[uint8x8x2_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+    test_vtbl2_p8 => vtbl2_p8: // 16-entry table: indices outside 0..16 yield 0
+    - table[poly8x8x2_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+    test_vtbl3_s8 => vtbl3_s8: // 24-entry table: indices outside 0..24 yield 0
+    - table[int8x8x3_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121,
+        0, 1, -2, 3, 4, -5, 6, 7
+    ] |
+    - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 21, 12] => [0_i8, -121, -17, 3, 34, -116, -5, -104] |
+    - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -27, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, -2]
+);
+
+test_vtbl!(
+    test_vtbl3_u8 => vtbl3_u8: // 24-entry table: indices outside 0..24 yield 0
+    - table[uint8x8x3_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255,
+        0, 1, 2, 3, 4, 5, 6, 7
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2]
+);
+
+test_vtbl!(
+    test_vtbl3_p8 => vtbl3_p8: // 24-entry table: indices outside 0..24 yield 0
+    - table[poly8x8x3_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255,
+        0, 1, 2, 3, 4, 5, 6, 7
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2]
+);
+
+test_vtbl!(
+    test_vtbl4_s8 => vtbl4_s8: // 32-entry table: indices outside 0..32 yield 0
+    - table[int8x8x4_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121,
+        0, 1, -2, 3, 4, -5, 6, 7,
+        8, -9, 10, 11, 12, -13, 14, 15
+    ] |
+    - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 25, 12] => [0_i8, -121, -17, 3, 34, -116, -9, -104] |
+    - ctrl[i8x8]: [4_i8, 11, 32, 10, -33, 27, 7, 18] => [68_i8, -117, 0, -84, 0, 11, 119, -2]
+);
+
+test_vtbl!(
+    test_vtbl4_u8 => vtbl4_u8: // 32-entry table: indices outside 0..32 yield 0
+    - table[uint8x8x4_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255,
+        0, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2]
+);
+
+test_vtbl!(
+    test_vtbl4_p8 => vtbl4_p8: // 32-entry table: indices outside 0..32 yield 0
+    - table[poly8x8x4_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        136, 153, 170, 187, 204, 221, 238, 255,
+        0, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2]
+);
+
+macro_rules! test_vtbx {
+    ($test_name:ident => $fn_id:ident:
+     - table[$table_t:ident]: [$($table_v:expr),*] |
+     - ext[$ext_t:ident]: [$($ext_v:expr),*] |
+     $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
+    ) => {
+        #[simd_test(enable = "neon")]
+        unsafe fn $test_name() {
+            // Build the table and the fallback (`ext`) vector from plain array
+            // literals and bit-cast them to `$table_t` / `$ext_t`.
+            let table: $table_t = ::mem::transmute([$($table_v),*]);
+            let ext: $ext_t = ::mem::transmute([$($ext_v),*]);
+
+            // For each `ctrl => expected` pair, call `$fn_id(ext, table, ctrl)`
+            // and assert that the result equals `expected`:
+            $(
+                {
+                    let ctrl: $ctrl_t = ::mem::transmute([$($ctrl_v),*]);
+                    let result = $fn_id(ext, table, ::mem::transmute(ctrl));
+                    let result: $ctrl_t = ::mem::transmute(result);
+                    let expected: $ctrl_t = ::mem::transmute([$($exp_v),*]);
+                    assert_eq!(result, expected);
+                }
+            )*
+        }
+    }
+}
+
+test_vtbx!(
+    test_vtbx1_s8 => vtbx1_s8: // 8-entry table: indices outside 0..8 keep the `ext` lane
+    - table[int8x8_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7] |
+    - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [-3_i8, 4, 1, 6, 0, 2, 7, 5] |
+    - ctrl[i8x8]: [3_i8, 8, 1, 9, 10, 2, -15, 5] => [-3_i8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+    test_vtbx1_u8 => vtbx1_u8: // 8-entry table: indices outside 0..8 keep the `ext` lane
+    - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+    test_vtbx1_p8 => vtbx1_p8: // 8-entry table: indices outside 0..8 keep the `ext` lane
+    - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+    test_vtbx2_s8 => vtbx2_s8: // 16-entry table: indices outside 0..16 keep the `ext` lane
+    - table[int8x8x2_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7, 8, 9, -10, 11, 12, -13, 14, 15] |
+    - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[i8x8]: [3_i8, 4, 1, 6, 10, 2, 7, 15] => [-3_i8, 4, 1, 6, -10, 2, 7, 15] |
+    - ctrl[i8x8]: [3_i8, 8, 1, 10, 17, 2, 15, -19] => [-3_i8, 8, 1, -10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+    test_vtbx2_u8 => vtbx2_u8: // 16-entry table: indices outside 0..16 keep the `ext` lane
+    - table[uint8x8x2_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_u8, 4, 1, 6, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_u8, 8, 1, 10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+    test_vtbx2_p8 => vtbx2_p8: // 16-entry table: indices outside 0..16 keep the `ext` lane
+    - table[poly8x8x2_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_u8, 4, 1, 6, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_u8, 8, 1, 10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+    test_vtbx3_s8 => vtbx3_s8: // 24-entry table: indices outside 0..24 keep the `ext` lane
+    - table[int8x8x3_t]: [
+        0_i8, 1, 2, -3, 4, 5, 6, 7,
+        8, 9, -10, 11, 12, -13, 14, 15,
+        16, -17, 18, 19, 20, 21, 22, 23 ] |
+    - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[i8x8]: [3_i8, 4, 17, 22, 10, 2, 7, 15] => [-3_i8, 4, -17, 22, -10, 2, 7, 15] |
+    - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -29] => [-3_i8, 8, -17, -10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+    test_vtbx3_u8 => vtbx3_u8: // 24-entry table: indices outside 0..24 keep the `ext` lane
+    - table[uint8x8x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23 ] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_u8, 4, 17, 22, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+    test_vtbx3_p8 => vtbx3_p8: // 24-entry table: indices outside 0..24 keep the `ext` lane
+    - table[poly8x8x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23 ] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_u8, 4, 17, 22, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+    test_vtbx4_s8 => vtbx4_s8: // 32-entry table: indices outside 0..32 keep the `ext` lane
+    - table[int8x8x4_t]: [
+        0_i8, 1, 2, -3, 4, 5, 6, 7,
+        8, 9, -10, 11, 12, -13, 14, 15,
+        16, -17, 18, 19, 20, 21, 22, 23,
+        -24, 25, 26, -27, 28, -29, 30, 31] |
+    - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[i8x8]: [3_i8, 31, 17, 22, 10, 29, 7, 15] => [-3_i8, 31, -17, 22, -10, -29, 7, 15] |
+    - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -42] => [-3_i8, 8, -17, -10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+    test_vtbx4_u8 => vtbx4_u8:
+    - table[uint8x8x4_t]: [
+        0_i8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31] |
+    - ext[uint8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_i8, 31, 17, 22, 10, 29, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_i8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+    test_vtbx4_p8 => vtbx4_p8:
+    - table[poly8x8x4_t]: [
+        0_i8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31] |
+    - ext[poly8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_i8, 31, 17, 22, 10, 29, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_i8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+// Aarch64 tests
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1_s8 => vqtbl1_s8:
+    - table[int8x16_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121
+    ] |
+    - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, -17, -72, 34, -116, 51, -104] |
+    - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1q_s8 => vqtbl1q_s8:
+    - table[int8x16_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121
+    ] |
+    - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18]
+        => [0_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 0, -84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1_u8 => vqtbl1_u8:
+    - table[uint8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1q_u8 => vqtbl1q_u8:
+    - table[uint8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+        => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1_p8 => vqtbl1_p8:
+    - table[poly8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl1q_p8 => vqtbl1q_p8:
+    - table[poly8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+        => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2_s8 => vqtbl2_s8:
+    - table[int8x16x2_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31
+    ] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 0, 10, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2q_s8 => vqtbl2q_s8:
+    - table[int8x16x2_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18]
+        => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 0, 10, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2_u8 => vqtbl2_u8:
+    - table[uint8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2q_u8 => vqtbl2q_u8:
+    - table[uint8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2_p8 => vqtbl2_p8:
+    - table[poly8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl2q_p8 => vqtbl2q_p8:
+    - table[poly8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3_s8 => vqtbl3_s8:
+    - table[int8x16x3_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47
+    ] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, 0, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3q_s8 => vqtbl3q_s8:
+    - table[int8x16x3_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18]
+        => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, 0, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3_u8 => vqtbl3_u8:
+    - table[uint8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3q_u8 => vqtbl3q_u8:
+    - table[uint8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3_p8 => vqtbl3_p8:
+    - table[poly8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl3q_p8 => vqtbl3q_p8:
+    - table[poly8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4_s8 => vqtbl4_s8:
+    - table[int8x16x4_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47,
+        48, -49, 50, -51, 52, -53, 54, -55,
+        56, -57, 58, -59, 60, -61, 62, -63
+    ] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 0, -51, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4q_s8 => vqtbl4q_s8:
+    - table[int8x16x4_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47,
+        48, -49, 50, -51, 52, -53, 54, -55,
+        56, -57, 58, -59, 60, -61, 62, -63
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18]
+        => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 0, -51, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4_u8 => vqtbl4_u8:
+    - table[uint8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4q_u8 => vqtbl4q_u8:
+    - table[uint8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4_p8 => vqtbl4_p8:
+    - table[poly8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+    test_vqtbl4q_p8 => vqtbl4q_p8:
+    - table[poly8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+        => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1_s8 => vqtbx1_s8:
+    - table[int8x16_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121
+    ] |
+    - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+    - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [100_i8, -121, -17, -72, 34, -116, 51, -104] |
+    - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 102, -84, 102, -105, 119, -107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1q_s8 => vqtbx1q_s8:
+    - table[int8x16_t]: [
+        0_i8, -17, 34, 51, 68, 85, 102, 119,
+        -106, -93, -84, -117, -104, -116, -72, -121
+    ] |
+    - ext[int8x16_t]: [
+        100_i8, -101, 102, -103, 104, -105, 106, -107,
+        108, -109, 110, -111, 112, -113, 114, -115
+    ] |
+    - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18]
+        => [100_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 110, -84, 102, -113, 119, -115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1_u8 => vqtbx1_u8:
+    - table[uint8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1q_u8 => vqtbx1q_u8:
+    - table[uint8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ext[uint8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+        => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 110, 84, 102, 113, 119, 115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1_p8 => vqtbx1_p8:
+    - table[poly8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] |
+    - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx1q_p8 => vqtbx1q_p8:
+    - table[poly8x16_t]: [
+        0_u8, 17, 34, 51, 68, 85, 102, 119,
+        106, 93, 84, 117, 104, 116, 72, 121
+    ] |
+    - ext[poly8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+        => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 110, 84, 102, 113, 119, 115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2_s8 => vqtbx2_s8:
+    - table[int8x16x2_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31
+    ] |
+    - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 102, 10, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2q_s8 => vqtbx2q_s8:
+    - table[int8x16x2_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31
+    ] |
+    - ext[int8x16_t]: [
+        100_i8, -101, 102, -103, 104, -105, 106, -107,
+        108, -109, 110, -111, 112, -113, 114, -115
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18]
+        => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 110, 10, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2_u8 => vqtbx2_u8:
+    - table[uint8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2q_u8 => vqtbx2q_u8:
+    - table[uint8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ext[uint8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2_p8 => vqtbx2_p8:
+    - table[poly8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx2q_p8 => vqtbx2q_p8:
+    - table[poly8x16x2_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31
+    ] |
+    - ext[poly8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3_s8 => vqtbx3_s8:
+    - table[int8x16x3_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47
+    ] |
+    - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, -103, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3q_s8 => vqtbx3q_s8:
+    - table[int8x16x3_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47
+    ] |
+    - ext[int8x16_t]: [
+        100_i8, -101, 102, -103, 104, -105, 106, -107,
+        108, -109, 110, -111, 112, -113, 114, -115
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18]
+        => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, -111, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3_u8 => vqtbx3_u8:
+    - table[uint8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3q_u8 => vqtbx3q_u8:
+    - table[uint8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ext[uint8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3_p8 => vqtbx3_p8:
+    - table[poly8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx3q_p8 => vqtbx3q_p8:
+    - table[poly8x16x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47
+    ] |
+    - ext[poly8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4_s8 => vqtbx4_s8:
+    - table[int8x16x4_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47,
+        48, -49, 50, -51, 52, -53, 54, -55,
+        56, -57, 58, -59, 60, -61, 62, -63
+    ] |
+    - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+    - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+    - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 102, -51, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4q_s8 => vqtbx4q_s8:
+    - table[int8x16x4_t]: [
+        0_i8, -1, 2, -3, 4, -5, 6, -7,
+        8, -9, 10, -11, 12, -13, 14, -15,
+        16, -17, 18, -19, 20, -21, 22, -23,
+        24, -25, 26, -27, 28, -29, 30, -31,
+        32, -33, 34, -35, 36, -37, 38, -39,
+        40, -41, 42, -43, 44, -45, 46, -47,
+        48, -49, 50, -51, 52, -53, 54, -55,
+        56, -57, 58, -59, 60, -61, 62, -63
+    ] |
+    - ext[int8x16_t]: [
+        100_i8, -101, 102, -103, 104, -105, 106, -107,
+        108, -109, 110, -111, 112, -113, 114, -115
+    ] |
+    - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18]
+        => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 110, -51, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4_u8 => vqtbx4_u8:
+    - table[uint8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4q_u8 => vqtbx4q_u8:
+    - table[uint8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ext[uint8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4_p8 => vqtbx4_p8:
+    - table[poly8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+    - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+    - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+    test_vqtbx4q_p8 => vqtbx4q_p8:
+    - table[poly8x16x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    ] |
+    - ext[poly8x16_t]: [
+        100_u8, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115
+    ] |
+    - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+        => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18]
+);
diff --git a/library/stdarch/crates/core_arch/src/arm/v6.rs b/library/stdarch/crates/core_arch/src/arm/v6.rs
new file mode 100644
index 00000000000..c24c40f963e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v6.rs
@@ -0,0 +1,49 @@
+//! ARMv6 intrinsics.
+//!
+//! The reference is the
+//! [ARMv6-M Architecture Reference Manual][armv6m].
+//!
+//! [armv6m]:
+//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Reverses the byte order of `x`, i.e. swaps the two bytes of the `u16`.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u16(x: u16) -> u16 {
+    x.swap_bytes()
+}
+
+/// Reverses the byte order of `x`, i.e. mirrors the four bytes of the `u32`.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u32(x: u32) -> u32 {
+    x.swap_bytes()
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arm::v6;
+
+    #[test]
+    fn _rev_u16() {
+        // 0x00FF -> 0xFF00: the low and high bytes trade places.
+        let input = 0b0000_0000_1111_1111_u16;
+        let expected = 0b1111_1111_0000_0000_u16;
+        unsafe {
+            assert_eq!(v6::_rev_u16(input), expected);
+        }
+    }
+
+    #[test]
+    fn _rev_u32() {
+        // 0x00FF00FF -> 0xFF00FF00: all four bytes are mirrored.
+        let input = 0b0000_0000_1111_1111_0000_0000_1111_1111_u32;
+        let expected = 0b1111_1111_0000_0000_1111_1111_0000_0000_u32;
+        unsafe {
+            assert_eq!(v6::_rev_u32(input), expected);
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/v7.rs b/library/stdarch/crates/core_arch/src/arm/v7.rs
new file mode 100644
index 00000000000..608907ce816
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v7.rs
@@ -0,0 +1,89 @@
+//! ARMv7 intrinsics.
+//!
+//! The reference is the
+//! [ARMv7-M Architecture Reference Manual
+//! (Issue E.b)][armv7m].
+//!
+//! [armv7m]:
+//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html
+
+pub use super::v6::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Counts the leading zero bits of `x` (8 when `x` is zero).
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u8(x: u8) -> u8 {
+    x.leading_zeros() as u8 // count is at most 8, so narrowing from `u32` is lossless
+}
+
+/// Counts the leading zero bits of `x` (16 when `x` is zero).
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u16(x: u16) -> u16 {
+    x.leading_zeros() as u16 // count is at most 16, so narrowing from `u32` is lossless
+}
+
+/// Counts the leading zero bits of `x` (32 when `x` is zero).
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u32(x: u32) -> u32 {
+    x.leading_zeros() // already a `u32`; no cast needed
+}
+
+/// Reverses the bit order of `x` via the `bitreverse` intrinsic.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(rbit))]
+pub unsafe fn _rbit_u32(x: u32) -> u32 {
+    use intrinsics::bitreverse;
+    bitreverse(x)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arm::v7;
+
+    #[test]
+    fn _clz_u8() {
+        unsafe {
+            assert_eq!(v7::_clz_u8(0b0000_1010u8), 4u8); // highest set bit is bit 3 of 8
+        }
+    }
+
+    #[test]
+    fn _clz_u16() {
+        unsafe {
+            assert_eq!(v7::_clz_u16(0b0000_1010u16), 12u16); // highest set bit is bit 3 of 16
+        }
+    }
+
+    #[test]
+    fn _clz_u32() {
+        unsafe {
+            assert_eq!(v7::_clz_u32(0b0000_1010u32), 28u32); // highest set bit is bit 3 of 32
+        }
+    }
+
+    #[test]
+    #[cfg(dont_compile_me)] // FIXME: test disabled; `v7` still needs to be added upstream in rustc
+    fn _rbit_u32() {
+        unsafe {
+            assert_eq!(
+                v7::_rbit_u32(0b0000_1010u32),
+                0b0101_0000_0000_0000_0000_0000_0000_0000u32
+            );
+        }
+    }
+}
diff --git a/library/stdarch/crates/coresimd/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs
index fdb4fb277dc..f179dbde8a9 100644
--- a/library/stdarch/crates/coresimd/src/lib.rs
+++ b/library/stdarch/crates/core_arch/src/lib.rs
@@ -1,10 +1,4 @@
-//! SIMD and vendor intrinsics support library.
-//!
-//! This documentation is for the `coresimd` crate, but you probably want to
-//! use the [`stdsimd` crate][stdsimd] which should have more complete
-//! documentation.
-//!
-//! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/
+//! Architecture-specific intrinsics.
 
 #![cfg_attr(stdsimd_strict, deny(warnings))]
 #![allow(dead_code)]
@@ -85,7 +79,7 @@ extern crate core as _core;
 extern crate std;
 #[cfg(test)]
 #[macro_use]
-extern crate stdsimd;
+extern crate std_detect;
 #[cfg(test)]
 extern crate stdsimd_test;
 #[cfg(test)]
@@ -94,10 +88,10 @@ extern crate test;
 #[cfg(all(test, target_arch = "wasm32"))]
 extern crate wasm_bindgen_test;
 
-#[path = "../../../coresimd/mod.rs"]
-mod coresimd;
+#[path = "mod.rs"]
+mod core_arch;
 
-pub use coresimd::arch;
+pub use core_arch::arch;
 
 #[allow(unused_imports)]
 use _core::clone;
diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs
new file mode 100644
index 00000000000..74a01be7709
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/macros.rs
@@ -0,0 +1,282 @@
+//! Utility macros.
+
+/// Dispatches a runtime 8-bit immediate to a macro that is given a literal.
+///
+/// `$imm8` is masked to its low 8 bits and matched against every value from
+/// 0 to 255; the selected arm invokes `$expand!` with the matching integer
+/// literal. The final `_` arm stands in for 255, making the match exhaustive.
+#[allow(unused)]
+macro_rules! constify_imm8 {
+    ($imm8:expr, $expand:ident) => {
+        // NOTE(review): presumably needed because `0b1111_1111` does not fit
+        // narrow signed types such as `i8` - confirm against the call sites.
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b1111_1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            15 => $expand!(15),
+            16 => $expand!(16),
+            17 => $expand!(17),
+            18 => $expand!(18),
+            19 => $expand!(19),
+            20 => $expand!(20),
+            21 => $expand!(21),
+            22 => $expand!(22),
+            23 => $expand!(23),
+            24 => $expand!(24),
+            25 => $expand!(25),
+            26 => $expand!(26),
+            27 => $expand!(27),
+            28 => $expand!(28),
+            29 => $expand!(29),
+            30 => $expand!(30),
+            31 => $expand!(31),
+            32 => $expand!(32),
+            33 => $expand!(33),
+            34 => $expand!(34),
+            35 => $expand!(35),
+            36 => $expand!(36),
+            37 => $expand!(37),
+            38 => $expand!(38),
+            39 => $expand!(39),
+            40 => $expand!(40),
+            41 => $expand!(41),
+            42 => $expand!(42),
+            43 => $expand!(43),
+            44 => $expand!(44),
+            45 => $expand!(45),
+            46 => $expand!(46),
+            47 => $expand!(47),
+            48 => $expand!(48),
+            49 => $expand!(49),
+            50 => $expand!(50),
+            51 => $expand!(51),
+            52 => $expand!(52),
+            53 => $expand!(53),
+            54 => $expand!(54),
+            55 => $expand!(55),
+            56 => $expand!(56),
+            57 => $expand!(57),
+            58 => $expand!(58),
+            59 => $expand!(59),
+            60 => $expand!(60),
+            61 => $expand!(61),
+            62 => $expand!(62),
+            63 => $expand!(63),
+            64 => $expand!(64),
+            65 => $expand!(65),
+            66 => $expand!(66),
+            67 => $expand!(67),
+            68 => $expand!(68),
+            69 => $expand!(69),
+            70 => $expand!(70),
+            71 => $expand!(71),
+            72 => $expand!(72),
+            73 => $expand!(73),
+            74 => $expand!(74),
+            75 => $expand!(75),
+            76 => $expand!(76),
+            77 => $expand!(77),
+            78 => $expand!(78),
+            79 => $expand!(79),
+            80 => $expand!(80),
+            81 => $expand!(81),
+            82 => $expand!(82),
+            83 => $expand!(83),
+            84 => $expand!(84),
+            85 => $expand!(85),
+            86 => $expand!(86),
+            87 => $expand!(87),
+            88 => $expand!(88),
+            89 => $expand!(89),
+            90 => $expand!(90),
+            91 => $expand!(91),
+            92 => $expand!(92),
+            93 => $expand!(93),
+            94 => $expand!(94),
+            95 => $expand!(95),
+            96 => $expand!(96),
+            97 => $expand!(97),
+            98 => $expand!(98),
+            99 => $expand!(99),
+            100 => $expand!(100),
+            101 => $expand!(101),
+            102 => $expand!(102),
+            103 => $expand!(103),
+            104 => $expand!(104),
+            105 => $expand!(105),
+            106 => $expand!(106),
+            107 => $expand!(107),
+            108 => $expand!(108),
+            109 => $expand!(109),
+            110 => $expand!(110),
+            111 => $expand!(111),
+            112 => $expand!(112),
+            113 => $expand!(113),
+            114 => $expand!(114),
+            115 => $expand!(115),
+            116 => $expand!(116),
+            117 => $expand!(117),
+            118 => $expand!(118),
+            119 => $expand!(119),
+            120 => $expand!(120),
+            121 => $expand!(121),
+            122 => $expand!(122),
+            123 => $expand!(123),
+            124 => $expand!(124),
+            125 => $expand!(125),
+            126 => $expand!(126),
+            127 => $expand!(127),
+            128 => $expand!(128),
+            129 => $expand!(129),
+            130 => $expand!(130),
+            131 => $expand!(131),
+            132 => $expand!(132),
+            133 => $expand!(133),
+            134 => $expand!(134),
+            135 => $expand!(135),
+            136 => $expand!(136),
+            137 => $expand!(137),
+            138 => $expand!(138),
+            139 => $expand!(139),
+            140 => $expand!(140),
+            141 => $expand!(141),
+            142 => $expand!(142),
+            143 => $expand!(143),
+            144 => $expand!(144),
+            145 => $expand!(145),
+            146 => $expand!(146),
+            147 => $expand!(147),
+            148 => $expand!(148),
+            149 => $expand!(149),
+            150 => $expand!(150),
+            151 => $expand!(151),
+            152 => $expand!(152),
+            153 => $expand!(153),
+            154 => $expand!(154),
+            155 => $expand!(155),
+            156 => $expand!(156),
+            157 => $expand!(157),
+            158 => $expand!(158),
+            159 => $expand!(159),
+            160 => $expand!(160),
+            161 => $expand!(161),
+            162 => $expand!(162),
+            163 => $expand!(163),
+            164 => $expand!(164),
+            165 => $expand!(165),
+            166 => $expand!(166),
+            167 => $expand!(167),
+            168 => $expand!(168),
+            169 => $expand!(169),
+            170 => $expand!(170),
+            171 => $expand!(171),
+            172 => $expand!(172),
+            173 => $expand!(173),
+            174 => $expand!(174),
+            175 => $expand!(175),
+            176 => $expand!(176),
+            177 => $expand!(177),
+            178 => $expand!(178),
+            179 => $expand!(179),
+            180 => $expand!(180),
+            181 => $expand!(181),
+            182 => $expand!(182),
+            183 => $expand!(183),
+            184 => $expand!(184),
+            185 => $expand!(185),
+            186 => $expand!(186),
+            187 => $expand!(187),
+            188 => $expand!(188),
+            189 => $expand!(189),
+            190 => $expand!(190),
+            191 => $expand!(191),
+            192 => $expand!(192),
+            193 => $expand!(193),
+            194 => $expand!(194),
+            195 => $expand!(195),
+            196 => $expand!(196),
+            197 => $expand!(197),
+            198 => $expand!(198),
+            199 => $expand!(199),
+            200 => $expand!(200),
+            201 => $expand!(201),
+            202 => $expand!(202),
+            203 => $expand!(203),
+            204 => $expand!(204),
+            205 => $expand!(205),
+            206 => $expand!(206),
+            207 => $expand!(207),
+            208 => $expand!(208),
+            209 => $expand!(209),
+            210 => $expand!(210),
+            211 => $expand!(211),
+            212 => $expand!(212),
+            213 => $expand!(213),
+            214 => $expand!(214),
+            215 => $expand!(215),
+            216 => $expand!(216),
+            217 => $expand!(217),
+            218 => $expand!(218),
+            219 => $expand!(219),
+            220 => $expand!(220),
+            221 => $expand!(221),
+            222 => $expand!(222),
+            223 => $expand!(223),
+            224 => $expand!(224),
+            225 => $expand!(225),
+            226 => $expand!(226),
+            227 => $expand!(227),
+            228 => $expand!(228),
+            229 => $expand!(229),
+            230 => $expand!(230),
+            231 => $expand!(231),
+            232 => $expand!(232),
+            233 => $expand!(233),
+            234 => $expand!(234),
+            235 => $expand!(235),
+            236 => $expand!(236),
+            237 => $expand!(237),
+            238 => $expand!(238),
+            239 => $expand!(239),
+            240 => $expand!(240),
+            241 => $expand!(241),
+            242 => $expand!(242),
+            243 => $expand!(243),
+            244 => $expand!(244),
+            245 => $expand!(245),
+            246 => $expand!(246),
+            247 => $expand!(247),
+            248 => $expand!(248),
+            249 => $expand!(249),
+            250 => $expand!(250),
+            251 => $expand!(251),
+            252 => $expand!(252),
+            253 => $expand!(253),
+            254 => $expand!(254),
+            // Catch-all arm: after the 8-bit mask the only value left is 255.
+            _ => $expand!(255),
+        }
+    };
+}
+
+/// Declares public SIMD vector types.
+///
+/// Accepts any number of `pub struct Name(fields...);` items, each optionally
+/// preceded by attributes (such as doc comments), and re-emits every one as a
+/// `#[repr(simd)]` tuple struct deriving `Copy`, `Clone` and `Debug`.
+#[allow(unused)]
+macro_rules! types {
+    ($(
+        $(#[$doc:meta])*
+        pub struct $name:ident($($fields:tt)*);
+    )*) => ($(
+        // Forward the attributes written on the struct unchanged.
+        $(#[$doc])*
+        #[derive(Copy, Clone, Debug)]
+        // Generated names are allowed to be lower-case (e.g. `i8x16`).
+        #[allow(non_camel_case_types)]
+        #[repr(simd)]
+        #[cfg_attr(feature = "cargo-clippy",
+                   allow(clippy::missing_inline_in_public_items))]
+        pub struct $name($($fields)*);
+    )*)
+}
diff --git a/library/stdarch/crates/core_arch/src/mips/mod.rs b/library/stdarch/crates/core_arch/src/mips/mod.rs
new file mode 100644
index 00000000000..e305ffa573e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mips/mod.rs
@@ -0,0 +1,14 @@
+//! MIPS
+
+mod msa;
+pub use self::msa::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Generates the trap instruction `BREAK`
+///
+/// Implemented by calling `intrinsics::abort`; as the `!` return type states,
+/// this function never returns.
+// Test builds attach `assert_instr(break)` to this function.
+#[cfg_attr(test, assert_instr(break))]
+#[inline]
+pub unsafe fn break_() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/mips/msa.rs b/library/stdarch/crates/core_arch/src/mips/msa.rs
new file mode 100644
index 00000000000..6e1d6c3cffa
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mips/msa.rs
@@ -0,0 +1,62 @@
+//! MIPS SIMD Architecture intrinsics
+//!
+//! The reference is [MIPS Architecture for Programmers Volume IV-j: The
+//! MIPS32 SIMD Architecture Module Revision 1.12][msa_ref].
+//!
+//! [msa_ref]: http://cdn2.imgtec.com/documentation/MD00866-2B-MSA32-AFP-01.12.pdf
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+// Declared through the crate's `types!` macro (from `macros.rs`), which emits
+// the struct as `#[repr(simd)]` and derives `Copy`, `Clone` and `Debug` for it.
+types! {
+    /// MIPS-specific 128-bit wide vector of 16 packed `i8`.
+    pub struct i8x16(
+        i8, i8, i8, i8, i8, i8, i8, i8,
+        i8, i8, i8, i8, i8, i8, i8, i8,
+    );
+}
+
+// Raw binding to the LLVM intrinsic `llvm.mips.add.a.b`, selected through
+// `link_name`; the public wrapper is `__msa_add_a_b`.
+// NOTE(review): `improper_ctypes` is presumably allowed because `i8x16` is a
+// `#[repr(simd)]` type crossing an `extern "C"` boundary - confirm.
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.mips.add.a.b"]
+    fn msa_add_a_b(a: i8x16, b: i8x16) -> i8x16;
+}
+
+/// Vector Add Absolute Values.
+///
+/// Adds the absolute values of the elements in `a` and `b` into the result
+/// vector.
+///
+/// This is a direct call to the `llvm.mips.add.a.b` binding and is compiled
+/// with the `msa` target feature enabled.
+#[inline]
+#[target_feature(enable = "msa")]
+// Test builds attach `assert_instr(add_a.b)` to this function.
+#[cfg_attr(test, assert_instr(add_a.b))]
+pub unsafe fn __msa_add_a_b(a: i8x16, b: i8x16) -> i8x16 {
+    msa_add_a_b(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+    // `core_arch` declares a single `mips` module that is shared by the `mips`
+    // and `mips64` targets; it has no `mips64` module, so `msa` must be
+    // imported through `mips`.
+    use core_arch::mips::msa;
+    use simd::*;
+    use stdsimd_test::simd_test;
+
+    // Every lane pairs |a| and |b| so that they sum to 5:
+    // (1, 4), (2, 3), (3, 2), (4, 1).
+    // NOTE(review): the `types!` macro derives no `PartialEq`; confirm on a
+    // MIPS target that the `i8x16` used here supports `assert_eq!`.
+    #[simd_test(enable = "msa")]
+    unsafe fn __msa_add_a_b() {
+        #[rustfmt::skip]
+        let a = i8x16(
+            1, 2, 3, 4,
+            1, 2, 3, 4,
+            1, 2, 3, 4,
+            1, 2, 3, 4,
+        );
+        #[rustfmt::skip]
+        let b = i8x16(
+            -4, -3, -2, -1,
+            -4, -3, -2, -1,
+            -4, -3, -2, -1,
+            -4, -3, -2, -1,
+        );
+        let r = i8x16(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5);
+
+        assert_eq!(r, msa::__msa_add_a_b(a, b));
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs
new file mode 100644
index 00000000000..f6f986b9579
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mod.rs
@@ -0,0 +1,506 @@
+//! `core_arch`
+
+#[macro_use]
+mod macros;
+
+mod simd;
+
+/// SIMD and vendor intrinsics module.
+///
+/// This module is intended to be the gateway to architecture-specific
+/// intrinsic functions, typically related to SIMD (but not always!). Each
+/// architecture that Rust compiles to may contain a submodule here, which
+/// means that this is not a portable module! If you're writing a portable
+/// library take care when using these APIs!
+///
+/// Under this module you'll find an architecture-named module, such as
+/// `x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a
+/// module entry here, only present on that particular target. For example the
+/// `i686-pc-windows-msvc` target will have an `x86` module here, whereas
+/// `x86_64-pc-windows-msvc` has `x86_64`.
+///
+/// [rfc]: https://github.com/rust-lang/rfcs/pull/2325
+/// [tracked]: https://github.com/rust-lang/rust/issues/48556
+///
+/// # Overview
+///
+/// This module exposes vendor-specific intrinsics that typically correspond to
+/// a single machine instruction. These intrinsics are not portable: their
+/// availability is architecture-dependent, and not all machines of that
+/// architecture might provide the intrinsic.
+///
+/// The `arch` module is intended to be a low-level implementation detail for
+/// higher-level APIs. Using it correctly can be quite tricky as you need to
+/// ensure at least a few guarantees are upheld:
+///
+/// * The correct architecture's module is used. For example the `arm` module
+///   isn't available on the `x86_64-unknown-linux-gnu` target. This is
+///   typically done by ensuring that `#[cfg]` is used appropriately when using
+///   this module.
+/// * The CPU the program is currently running on supports the function being
+///   called. For example it is unsafe to call an AVX2 function on a CPU that
+///   doesn't actually support AVX2.
+///
+/// As a result of the latter of these guarantees all intrinsics in this module
+/// are `unsafe` and extra care needs to be taken when calling them!
+///
+/// # CPU Feature Detection
+///
+/// In order to call these APIs in a safe fashion there are a number of
+/// mechanisms available to ensure that the correct CPU feature is available
+/// to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64`
+/// intrinsic on the `x86` and `x86_64` architectures. This function requires
+/// the AVX2 feature as [documented by Intel][intel-dox] so to correctly call
+/// this function we need to (a) guarantee we only call it on `x86`/`x86_64`
+/// and (b) ensure that the CPU feature is available.
+///
+/// [intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100
+///
+/// ## Static CPU Feature Detection
+///
+/// The first option available to us is to conditionally compile code via the
+/// `#[cfg]` attribute. CPU features correspond to the `target_feature` cfg
+/// available, and can be used like so:
+///
+/// ```ignore
+/// #[cfg(
+///     all(
+///         any(target_arch = "x86", target_arch = "x86_64"),
+///         target_feature = "avx2"
+///     )
+/// )]
+/// fn foo() {
+///     #[cfg(target_arch = "x86")]
+///     use std::arch::x86::_mm256_add_epi64;
+///     #[cfg(target_arch = "x86_64")]
+///     use std::arch::x86_64::_mm256_add_epi64;
+///
+///     unsafe {
+///         _mm256_add_epi64(...);
+///     }
+/// }
+/// ```
+///
+/// Here we're using `#[cfg(target_feature = "avx2")]` to conditionally compile
+/// this function into our module. This means that if the `avx2` feature is
+/// *enabled statically* then we'll use the `_mm256_add_epi64` function at
+/// runtime. The `unsafe` block here can be justified through the usage of
+/// `#[cfg]` to only compile the code in situations where the safety guarantees
+/// are upheld.
+///
+/// Statically enabling a feature is typically done with the `-C
+/// target-feature` or `-C target-cpu` flags to the compiler. For example if
+/// your local CPU supports AVX2 then you can compile the above function with:
+///
+/// ```sh
+/// $ RUSTFLAGS='-C target-cpu=native' cargo build
+/// ```
+///
+/// Or otherwise you can specifically enable just the AVX2 feature:
+///
+/// ```sh
+/// $ RUSTFLAGS='-C target-feature=+avx2' cargo build
+/// ```
+///
+/// Note that when you compile a binary with a particular feature enabled it's
+/// important to ensure that you only run the binary on systems which satisfy
+/// the required feature set.
+///
+/// ## Dynamic CPU Feature Detection
+///
+/// Sometimes statically dispatching isn't quite what you want. Instead you
+/// might want to build a portable binary that runs across a variety of CPUs,
+/// but at runtime it selects the most optimized implementation available. This
+/// allows you to build a "least common denominator" binary which has certain
+/// sections more optimized for different CPUs.
+///
+/// Taking our example from before, we're going to compile our binary
+/// *without* AVX2 support, but we'd like to enable it for just one function.
+/// We can do that in a manner like:
+///
+/// ```ignore
+/// fn foo() {
+///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+///     {
+///         if is_x86_feature_detected!("avx2") {
+///             return unsafe { foo_avx2() };
+///         }
+///     }
+///
+///     // fallback implementation without using AVX2
+/// }
+///
+/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// #[target_feature(enable = "avx2")]
+/// unsafe fn foo_avx2() {
+///     #[cfg(target_arch = "x86")]
+///     use std::arch::x86::_mm256_add_epi64;
+///     #[cfg(target_arch = "x86_64")]
+///     use std::arch::x86_64::_mm256_add_epi64;
+///
+///     _mm256_add_epi64(...);
+/// }
+/// ```
+///
+/// There's a couple of components in play here, so let's go through them in
+/// detail!
+///
+/// * First up we notice the `is_x86_feature_detected!` macro. Provided by
+///   the standard library, this macro will perform necessary runtime detection
+///   to determine whether the CPU the program is running on supports the
+///   specified feature. In this case the macro will expand to a boolean
+///   expression evaluating to whether the local CPU has the AVX2 feature or
+///   not.
+///
+///   Note that this macro, like the `arch` module, is platform-specific. For
+///   example calling `is_x86_feature_detected!("avx2")` on ARM will be a
+///   compile time error. To ensure we don't hit this error a statement level
+///   `#[cfg]` is used to only compile usage of the macro on `x86`/`x86_64`.
+///
+/// * Next up we see our AVX2-enabled function, `foo_avx2`. This function is
+///   decorated with the `#[target_feature]` attribute which enables a CPU
+///   feature for just this one function. Using a compiler flag like `-C
+///   target-feature=+avx2` will enable AVX2 for the entire program, but using
+///   an attribute will only enable it for the one function. Usage of the
+///   `#[target_feature]` attribute currently requires the function to also be
+///   `unsafe`, as we see here. This is because the function can only be
+///   correctly called on systems which have the AVX2 feature (like the
+///   intrinsics themselves).
+///
+/// And with all that we should have a working program! This program will run
+/// across all machines and it'll use the optimized AVX2 implementation on
+/// machines where support is detected.
+///
+/// # Ergonomics
+///
+/// It's important to note that using the `arch` module is not the easiest
+/// thing in the world, so if you're curious to try it out you may want to
+/// brace yourself for some wordiness!
+///
+/// The primary purpose of this module is to enable stable crates on crates.io
+/// to build up much more ergonomic abstractions which end up using SIMD under
+/// the hood. Over time these abstractions may also move into the standard
+/// library itself, but for now this module is tasked with providing the bare
+/// minimum necessary to use vendor intrinsics on stable Rust.
+///
+/// # Other architectures
+///
+/// This documentation is only for one particular architecture, you can find
+/// others at:
+///
+/// * [`x86`]
+/// * [`x86_64`]
+/// * [`arm`]
+/// * [`aarch64`]
+/// * [`mips`]
+/// * [`mips64`]
+/// * [`powerpc`]
+/// * [`powerpc64`]
+/// * [`nvptx`]
+/// * [`wasm32`]
+///
+/// [`x86`]: x86/index.html
+/// [`x86_64`]: x86_64/index.html
+/// [`arm`]: arm/index.html
+/// [`aarch64`]: aarch64/index.html
+/// [`mips`]: mips/index.html
+/// [`mips64`]: mips64/index.html
+/// [`powerpc`]: powerpc/index.html
+/// [`powerpc64`]: powerpc64/index.html
+/// [`nvptx`]: nvptx/index.html
+/// [`wasm32`]: wasm32/index.html
+///
+/// # Examples
+///
+/// First let's take a look at not actually using any intrinsics but instead
+/// using LLVM's auto-vectorization to produce optimized vectorized code for
+/// AVX2 and also for the default platform.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox),feature(stdsimd))]
+/// # #[cfg(not(dox))]
+/// # #[macro_use(is_x86_feature_detected)]
+/// # extern crate std_detect;
+///
+/// fn main() {
+///     let mut dst = [0];
+///     add_quickly(&[1], &[2], &mut dst);
+///     assert_eq!(dst[0], 3);
+/// }
+///
+/// fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
+///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+///     {
+///         // Note that this `unsafe` block is safe because we're testing
+///         // that the `avx2` feature is indeed available on our CPU.
+///         if is_x86_feature_detected!("avx2") {
+///             return unsafe { add_quickly_avx2(a, b, c) };
+///         }
+///     }
+///
+///     add_quickly_fallback(a, b, c)
+/// }
+///
+/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// #[target_feature(enable = "avx2")]
+/// unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
+///     add_quickly_fallback(a, b, c) // the function below is inlined here
+/// }
+///
+/// fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
+///     for ((a, b), c) in a.iter().zip(b).zip(c) {
+///         *c = *a + *b;
+///     }
+/// }
+/// ```
+///
+/// Next up let's take a look at an example of manually using intrinsics. Here
+/// we'll be using SSE4.1 features to implement hex encoding.
+///
+/// ```
+/// # #![cfg_attr(not(dox),feature(stdsimd))]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # extern crate core_arch as std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use(is_x86_feature_detected)]
+/// # extern crate std_detect;
+///
+/// fn main() {
+///     let mut dst = [0; 32];
+///     hex_encode(b"\x01\x02\x03", &mut dst);
+///     assert_eq!(&dst[..6], b"010203");
+///
+///     let mut src = [0; 16];
+///     for i in 0..16 {
+///         src[i] = (i + 1) as u8;
+///     }
+///     hex_encode(&src, &mut dst);
+///     assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
+/// }
+///
+/// pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
+///     let len = src.len().checked_mul(2).unwrap();
+///     assert!(dst.len() >= len);
+///
+///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+///     {
+///         if is_x86_feature_detected!("sse4.1") {
+///             return unsafe { hex_encode_sse41(src, dst) };
+///         }
+///     }
+///
+///     hex_encode_fallback(src, dst)
+/// }
+///
+/// // translated from
+/// // https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
+/// #[target_feature(enable = "sse4.1")]
+/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
+///     #[cfg(target_arch = "x86")]
+///     use std::arch::x86::*;
+///     #[cfg(target_arch = "x86_64")]
+///     use std::arch::x86_64::*;
+///
+///     let ascii_zero = _mm_set1_epi8(b'0' as i8);
+///     let nines = _mm_set1_epi8(9);
+///     let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
+///     let and4bits = _mm_set1_epi8(0xf);
+///
+///     let mut i = 0_isize;
+///     while src.len() >= 16 {
+///         let invec = _mm_loadu_si128(src.as_ptr() as *const _);
+///
+///         let masked1 = _mm_and_si128(invec, and4bits);
+///         let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
+///
+///         // return 0xff corresponding to the elements > 9, or 0x00 otherwise
+///         let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
+///         let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
+///
+///         // add '0' or the offset depending on the masks
+///         let masked1 = _mm_add_epi8(
+///             masked1,
+///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
+///         );
+///         let masked2 = _mm_add_epi8(
+///             masked2,
+///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
+///         );
+///
+///         // interleave masked1 and masked2 bytes
+///         let res1 = _mm_unpacklo_epi8(masked2, masked1);
+///         let res2 = _mm_unpackhi_epi8(masked2, masked1);
+///
+///         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+///         _mm_storeu_si128(
+///             dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
+///             res2,
+///         );
+///         src = &src[16..];
+///         i += 16;
+///     }
+///
+///     let i = i as usize;
+///     hex_encode_fallback(src, &mut dst[i * 2..]);
+/// }
+///
+/// fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
+///     fn hex(byte: u8) -> u8 {
+///         static TABLE: &[u8] = b"0123456789abcdef";
+///         TABLE[byte as usize]
+///     }
+///
+///     for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
+///         slots[0] = hex((*byte >> 4) & 0xf);
+///         slots[1] = hex(*byte & 0xf);
+///     }
+/// }
+/// ```
+// Public gateway module: one sub-module per target architecture, each compiled
+// only for its own `target_arch` or when the `dox` cfg is set.
+#[stable(feature = "simd_arch", since = "1.27.0")]
+pub mod arch {
+    /// Platform-specific intrinsics for the `x86` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "x86", dox))]
+    #[doc(cfg(target_arch = "x86"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub mod x86 {
+        #[stable(feature = "simd_x86", since = "1.27.0")]
+        pub use core_arch::x86::*;
+    }
+
+    /// Platform-specific intrinsics for the `x86_64` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "x86_64", dox))]
+    #[doc(cfg(target_arch = "x86_64"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub mod x86_64 {
+        // Exposes the shared `x86` intrinsics plus the `x86_64`-only ones.
+        #[stable(feature = "simd_x86", since = "1.27.0")]
+        pub use core_arch::x86::*;
+        #[stable(feature = "simd_x86", since = "1.27.0")]
+        pub use core_arch::x86_64::*;
+    }
+
+    /// Platform-specific intrinsics for the `arm` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "arm", dox))]
+    #[doc(cfg(target_arch = "arm"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod arm {
+        pub use core_arch::arm::*;
+    }
+
+    /// Platform-specific intrinsics for the `aarch64` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "aarch64", dox))]
+    #[doc(cfg(target_arch = "aarch64"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod aarch64 {
+        // Exposes the `aarch64`-only intrinsics plus the shared `arm` ones.
+        pub use core_arch::aarch64::*;
+        pub use core_arch::arm::*;
+    }
+
+    /// Platform-specific intrinsics for the `wasm32` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "wasm32", dox))]
+    #[doc(cfg(target_arch = "wasm32"))]
+    #[stable(feature = "simd_wasm32", since = "1.33.0")]
+    pub mod wasm32 {
+        #[stable(feature = "simd_wasm32", since = "1.33.0")]
+        pub use core_arch::wasm32::*;
+    }
+
+    /// Platform-specific intrinsics for the `mips` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "mips", dox))]
+    #[doc(cfg(target_arch = "mips"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod mips {
+        pub use core_arch::mips::*;
+    }
+
+    /// Platform-specific intrinsics for the `mips64` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "mips64", dox))]
+    #[doc(cfg(target_arch = "mips64"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod mips64 {
+        // There is no separate `mips64` implementation module; this re-exports
+        // the `mips` module, which is compiled for both targets.
+        pub use core_arch::mips::*;
+    }
+
+    /// Platform-specific intrinsics for the `PowerPC` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "powerpc", dox))]
+    #[doc(cfg(target_arch = "powerpc"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod powerpc {
+        pub use core_arch::powerpc::*;
+    }
+
+    /// Platform-specific intrinsics for the `PowerPC64` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "powerpc64", dox))]
+    #[doc(cfg(target_arch = "powerpc64"))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod powerpc64 {
+        pub use core_arch::powerpc64::*;
+    }
+
+    /// Platform-specific intrinsics for the `NVPTX` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    // A single `nvptx` module serves both the `nvptx` and `nvptx64` targets.
+    #[cfg(any(target_arch = "nvptx", target_arch = "nvptx64", dox))]
+    #[doc(cfg(any(target_arch = "nvptx", target_arch = "nvptx64")))]
+    #[unstable(feature = "stdsimd", issue = "27731")]
+    pub mod nvptx {
+        pub use core_arch::nvptx::*;
+    }
+}
+
+// Private implementation modules. They are surfaced publicly only through the
+// re-exports in `arch`. Several are shared between a 32-bit and a 64-bit
+// target, which is why their `cfg` lists two architectures.
+mod simd_llvm;
+
+// Compiled for both `x86` and `x86_64`.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", dox))]
+#[doc(cfg(any(target_arch = "x86", target_arch = "x86_64")))]
+mod x86;
+#[cfg(any(target_arch = "x86_64", dox))]
+#[doc(cfg(target_arch = "x86_64"))]
+mod x86_64;
+
+#[cfg(any(target_arch = "aarch64", dox))]
+#[doc(cfg(target_arch = "aarch64"))]
+mod aarch64;
+// Compiled for both `arm` and `aarch64`.
+#[cfg(any(target_arch = "arm", target_arch = "aarch64", dox))]
+#[doc(cfg(any(target_arch = "arm", target_arch = "aarch64")))]
+mod arm;
+
+#[cfg(any(target_arch = "wasm32", dox))]
+#[doc(cfg(target_arch = "wasm32"))]
+mod wasm32;
+
+// Compiled for both `mips` and `mips64`; there is no separate `mips64` module.
+#[cfg(any(target_arch = "mips", target_arch = "mips64", dox))]
+#[doc(cfg(any(target_arch = "mips", target_arch = "mips64")))]
+mod mips;
+
+// Compiled for both `powerpc` and `powerpc64`.
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64", dox))]
+#[doc(cfg(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod powerpc;
+
+#[cfg(any(target_arch = "powerpc64", dox))]
+#[doc(cfg(target_arch = "powerpc64"))]
+mod powerpc64;
+
+// Compiled for both `nvptx` and `nvptx64`.
+#[cfg(any(target_arch = "nvptx", target_arch = "nvptx64", dox))]
+#[doc(cfg(any(target_arch = "nvptx", target_arch = "nvptx64")))]
+mod nvptx;
diff --git a/library/stdarch/crates/core_arch/src/nvptx/mod.rs b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
new file mode 100644
index 00000000000..0247d2e44e6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
@@ -0,0 +1,126 @@
+//! NVPTX intrinsics (experimental)
+//!
+//! These intrinsics form the foundation of the CUDA
+//! programming model.
+//!
+//! The reference is the [CUDA C Programming Guide][cuda_c]. The
+//! [LLVM NVPTX Backend documentation][llvm_docs] is also relevant.
+//!
+//! [cuda_c]:
+//! http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+//! [llvm_docs]:
+//! https://llvm.org/docs/NVPTXUsage.html
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.nvvm.barrier0"]
+    fn syncthreads() -> ();
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.x"]
+    fn block_dim_x() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.y"]
+    fn block_dim_y() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.z"]
+    fn block_dim_z() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.x"]
+    fn block_idx_x() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.y"]
+    fn block_idx_y() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.z"]
+    fn block_idx_z() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.x"]
+    fn grid_dim_x() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.y"]
+    fn grid_dim_y() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.z"]
+    fn grid_dim_z() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.tid.x"]
+    fn thread_idx_x() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.tid.y"]
+    fn thread_idx_y() -> i32;
+    #[link_name = "llvm.nvvm.read.ptx.sreg.tid.z"]
+    fn thread_idx_z() -> i32;
+}
+
+/// Synchronizes all threads in the block (wraps `llvm.nvvm.barrier0`).
+#[inline]
+pub unsafe fn _syncthreads() -> () {
+    syncthreads()
+}
+
+/// Returns the x component of the thread-block dimension (`ntid.x`).
+#[inline]
+pub unsafe fn _block_dim_x() -> i32 {
+    block_dim_x()
+}
+
+/// Returns the y component of the thread-block dimension (`ntid.y`).
+#[inline]
+pub unsafe fn _block_dim_y() -> i32 {
+    block_dim_y()
+}
+
+/// Returns the z component of the thread-block dimension (`ntid.z`).
+#[inline]
+pub unsafe fn _block_dim_z() -> i32 {
+    block_dim_z()
+}
+
+/// Returns the x component of the thread-block index (`ctaid.x`).
+#[inline]
+pub unsafe fn _block_idx_x() -> i32 {
+    block_idx_x()
+}
+
+/// Returns the y component of the thread-block index (`ctaid.y`).
+#[inline]
+pub unsafe fn _block_idx_y() -> i32 {
+    block_idx_y()
+}
+
+/// Returns the z component of the thread-block index (`ctaid.z`).
+#[inline]
+pub unsafe fn _block_idx_z() -> i32 {
+    block_idx_z()
+}
+
+/// Returns the x component of the block-grid dimension (`nctaid.x`).
+#[inline]
+pub unsafe fn _grid_dim_x() -> i32 {
+    grid_dim_x()
+}
+
+/// Returns the y component of the block-grid dimension (`nctaid.y`).
+#[inline]
+pub unsafe fn _grid_dim_y() -> i32 {
+    grid_dim_y()
+}
+
+/// Returns the z component of the block-grid dimension (`nctaid.z`).
+#[inline]
+pub unsafe fn _grid_dim_z() -> i32 {
+    grid_dim_z()
+}
+
+/// Returns the x component of the thread index (`tid.x`).
+#[inline]
+pub unsafe fn _thread_idx_x() -> i32 {
+    thread_idx_x()
+}
+
+/// Returns the y component of the thread index (`tid.y`).
+#[inline]
+pub unsafe fn _thread_idx_y() -> i32 {
+    thread_idx_y()
+}
+
+/// Returns the z component of the thread index (`tid.z`).
+#[inline]
+pub unsafe fn _thread_idx_z() -> i32 {
+    thread_idx_z()
+}
+
+/// Generates the trap instruction `TRAP` by calling `::intrinsics::abort()`; never returns.
+#[inline]
+pub unsafe fn trap() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/altivec.rs b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs
new file mode 100644
index 00000000000..409b92d9023
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs
@@ -0,0 +1,1488 @@
+//! PowerPC AltiVec intrinsics.
+//!
+//! AltiVec is a brand name trademarked by Freescale (previously Motorola) for
+//! the standard `Category:Vector` part of the Power ISA v.2.03 specification.
+//! This Category is also known as VMX (used by IBM), and "Velocity Engine" (a
+//! brand name previously used by Apple).
+//!
+//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA
+//! NVlink)] and [POWER ISA v3.0B (for POWER9)].
+//!
+//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u
+//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv
+
+#![allow(non_camel_case_types)]
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+types! {
+    /// PowerPC-specific 128-bit wide vector of sixteen packed `i8`
+    pub struct vector_signed_char(i8, i8, i8, i8, i8, i8, i8, i8,
+                                  i8, i8, i8, i8, i8, i8, i8, i8);
+    /// PowerPC-specific 128-bit wide vector of sixteen packed `u8`
+    pub struct vector_unsigned_char(u8, u8, u8, u8, u8, u8, u8, u8,
+                                    u8, u8, u8, u8, u8, u8, u8, u8);
+
+    /// PowerPC-specific 128-bit wide vector mask of sixteen packed elements
+    pub struct vector_bool_char(i8, i8, i8, i8, i8, i8, i8, i8,
+                                i8, i8, i8, i8, i8, i8, i8, i8);
+    /// PowerPC-specific 128-bit wide vector of eight packed `i16`
+    pub struct vector_signed_short(i16, i16, i16, i16, i16, i16, i16, i16);
+    /// PowerPC-specific 128-bit wide vector of eight packed `u16`
+    pub struct vector_unsigned_short(u16, u16, u16, u16, u16, u16, u16, u16);
+    /// PowerPC-specific 128-bit wide vector mask of eight packed elements
+    pub struct vector_bool_short(i16, i16, i16, i16, i16, i16, i16, i16);
+    // pub struct vector_pixel(???);
+    /// PowerPC-specific 128-bit wide vector of four packed `i32`
+    pub struct vector_signed_int(i32, i32, i32, i32);
+    /// PowerPC-specific 128-bit wide vector of four packed `u32`
+    pub struct vector_unsigned_int(u32, u32, u32, u32);
+    /// PowerPC-specific 128-bit wide vector mask of four packed elements
+    pub struct vector_bool_int(i32, i32, i32, i32);
+    /// PowerPC-specific 128-bit wide vector of four packed `f32`
+    pub struct vector_float(f32, f32, f32, f32);
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.ppc.altivec.vperm"]
+    fn vperm(
+        a: vector_signed_int,
+        b: vector_signed_int,
+        c: vector_unsigned_char,
+    ) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmhaddshs"]
+    fn vmhaddshs(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_short,
+    ) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vmhraddshs"]
+    fn vmhraddshs(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_short,
+    ) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vmsumuhs"]
+    fn vmsumuhs(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vmsumshs"]
+    fn vmsumshs(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_int,
+    ) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmsumubm"]
+    fn vmsumubm(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vmsummbm"]
+    fn vmsummbm(
+        a: vector_signed_char,
+        b: vector_unsigned_char,
+        c: vector_signed_int,
+    ) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmsumuhm"]
+    fn vmsumuhm(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vmsumshm"]
+    fn vmsumshm(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_int,
+    ) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmaddfp"]
+    fn vmaddfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float;
+    #[link_name = "llvm.ppc.altivec.vnmsubfp"]
+    fn vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float;
+    #[link_name = "llvm.ppc.altivec.vsum2sws"]
+    fn vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vsum4ubs"]
+    fn vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vsum4sbs"]
+    fn vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vsum4shs"]
+    fn vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmuleub"]
+    fn vmuleub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short;
+    #[link_name = "llvm.ppc.altivec.vmulesb"]
+    fn vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vmuleuh"]
+    fn vmuleuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vmulesh"]
+    fn vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmuloub"]
+    fn vmuloub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short;
+    #[link_name = "llvm.ppc.altivec.vmulosb"]
+    fn vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vmulouh"]
+    fn vmulouh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int;
+    #[link_name = "llvm.ppc.altivec.vmulosh"]
+    fn vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int;
+}
+
+mod sealed {
+
+    use super::*;
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmuleub))]
+    unsafe fn vec_vmuleub(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_short {
+        vmuleub(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmulesb))]
+    unsafe fn vec_vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short {
+        vmulesb(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmuleuh))]
+    unsafe fn vec_vmuleuh(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+    ) -> vector_unsigned_int {
+        vmuleuh(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmulesh))]
+    unsafe fn vec_vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int {
+        vmulesh(a, b)
+    }
+
+    pub trait VectorMule<Result> {
+        unsafe fn vec_mule(self, b: Self) -> Result;
+    }
+
+    impl VectorMule<vector_unsigned_short> for vector_unsigned_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mule(self, b: Self) -> vector_unsigned_short {
+            vmuleub(self, b)
+        }
+    }
+    impl VectorMule<vector_signed_short> for vector_signed_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mule(self, b: Self) -> vector_signed_short {
+            vmulesb(self, b)
+        }
+    }
+    impl VectorMule<vector_unsigned_int> for vector_unsigned_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mule(self, b: Self) -> vector_unsigned_int {
+            vmuleuh(self, b)
+        }
+    }
+    impl VectorMule<vector_signed_int> for vector_signed_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mule(self, b: Self) -> vector_signed_int {
+            vmulesh(self, b)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmuloub))]
+    unsafe fn vec_vmuloub(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_short {
+        vmuloub(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmulosb))]
+    unsafe fn vec_vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short {
+        vmulosb(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmulouh))]
+    unsafe fn vec_vmulouh(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+    ) -> vector_unsigned_int {
+        vmulouh(a, b)
+    }
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmulosh))]
+    unsafe fn vec_vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int {
+        vmulosh(a, b)
+    }
+
+    pub trait VectorMulo<Result> {
+        unsafe fn vec_mulo(self, b: Self) -> Result;
+    }
+
+    impl VectorMulo<vector_unsigned_short> for vector_unsigned_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_short {
+            vmuloub(self, b)
+        }
+    }
+    impl VectorMulo<vector_signed_short> for vector_signed_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mulo(self, b: Self) -> vector_signed_short {
+            vmulosb(self, b)
+        }
+    }
+    impl VectorMulo<vector_unsigned_int> for vector_unsigned_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_int {
+            vmulouh(self, b)
+        }
+    }
+    impl VectorMulo<vector_signed_int> for vector_signed_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_mulo(self, b: Self) -> vector_signed_int {
+            vmulosh(self, b)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vsum4ubs))]
+    unsafe fn vec_vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int {
+        vsum4ubs(a, b)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vsum4sbs))]
+    unsafe fn vec_vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int {
+        vsum4sbs(a, b)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vsum4shs))]
+    unsafe fn vec_vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int {
+        vsum4shs(a, b)
+    }
+
+    pub trait VectorSum4s<Other> {
+        unsafe fn vec_sum4s(self, b: Other) -> Other;
+    }
+
+    impl VectorSum4s<vector_unsigned_int> for vector_unsigned_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_sum4s(self, b: vector_unsigned_int) -> vector_unsigned_int {
+            vsum4ubs(self, b)
+        }
+    }
+
+    impl VectorSum4s<vector_signed_int> for vector_signed_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int {
+            vsum4sbs(self, b)
+        }
+    }
+
+    impl VectorSum4s<vector_signed_int> for vector_signed_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int {
+            vsum4shs(self, b)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vsum2sws))]
+    unsafe fn vec_vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+        vsum2sws(a, b)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vnmsubfp))]
+    unsafe fn vec_vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+        vnmsubfp(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmaddfp))]
+    unsafe fn vec_vmaddfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+        vmaddfp(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsumubm))]
+    unsafe fn vec_vmsumubm(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int {
+        vmsumubm(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsummbm))]
+    unsafe fn vec_vmsummbm(
+        a: vector_signed_char,
+        b: vector_unsigned_char,
+        c: vector_signed_int,
+    ) -> vector_signed_int {
+        vmsummbm(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsumuhm))]
+    unsafe fn vec_vmsumuhm(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int {
+        vmsumuhm(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsumshm))]
+    unsafe fn vec_vmsumshm(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_int,
+    ) -> vector_signed_int {
+        vmsumshm(a, b, c)
+    }
+
+    pub trait VectorMsum<B, Other> {
+        unsafe fn vec_msum(self, b: B, c: Other) -> Other;
+    }
+
+    impl VectorMsum<vector_unsigned_char, vector_unsigned_int> for vector_unsigned_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msum(
+            self,
+            b: vector_unsigned_char,
+            c: vector_unsigned_int,
+        ) -> vector_unsigned_int {
+            vmsumubm(self, b, c)
+        }
+    }
+
+    impl VectorMsum<vector_unsigned_char, vector_signed_int> for vector_signed_char {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msum(
+            self,
+            b: vector_unsigned_char,
+            c: vector_signed_int,
+        ) -> vector_signed_int {
+            vmsummbm(self, b, c)
+        }
+    }
+
+    impl VectorMsum<vector_unsigned_short, vector_unsigned_int> for vector_unsigned_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msum(
+            self,
+            b: vector_unsigned_short,
+            c: vector_unsigned_int,
+        ) -> vector_unsigned_int {
+            vmsumuhm(self, b, c)
+        }
+    }
+
+    impl VectorMsum<vector_signed_short, vector_signed_int> for vector_signed_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msum(
+            self,
+            b: vector_signed_short,
+            c: vector_signed_int,
+        ) -> vector_signed_int {
+            vmsumshm(self, b, c)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsumuhs))]
+    unsafe fn vec_vmsumuhs(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+        c: vector_unsigned_int,
+    ) -> vector_unsigned_int {
+        vmsumuhs(a, b, c)
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmsumshs))]
+    unsafe fn vec_vmsumshs(
+        a: vector_signed_short,
+        b: vector_signed_short,
+        c: vector_signed_int,
+    ) -> vector_signed_int {
+        vmsumshs(a, b, c)
+    }
+
+    pub trait VectorMsums<Other> {
+        unsafe fn vec_msums(self, b: Self, c: Other) -> Other;
+    }
+
+    impl VectorMsums<vector_unsigned_int> for vector_unsigned_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msums(self, b: Self, c: vector_unsigned_int) -> vector_unsigned_int {
+            vmsumuhs(self, b, c)
+        }
+    }
+
+    impl VectorMsums<vector_signed_int> for vector_signed_short {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_msums(self, b: Self, c: vector_signed_int) -> vector_signed_int {
+            vmsumshs(self, b, c)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vperm))]
+    unsafe fn vec_vperm(
+        a: vector_signed_int,
+        b: vector_signed_int,
+        c: vector_unsigned_char,
+    ) -> vector_signed_int {
+        vperm(a, b, c)
+    }
+
+    pub trait VectorPerm {
+        unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self;
+    }
+
+    macro_rules! vector_perm {
+        {$impl: ident} => {
+            impl VectorPerm for $impl {
+            #[inline]
+            #[target_feature(enable = "altivec")]
+            unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self {
+                    mem::transmute(vec_vperm(mem::transmute(self), mem::transmute(b), c))
+                }
+            }
+        }
+    }
+
+    vector_perm! { vector_signed_char }
+    vector_perm! { vector_unsigned_char }
+    vector_perm! { vector_bool_char }
+
+    vector_perm! { vector_signed_short }
+    vector_perm! { vector_unsigned_short }
+    vector_perm! { vector_bool_short }
+
+    vector_perm! { vector_signed_int }
+    vector_perm! { vector_unsigned_int }
+    vector_perm! { vector_bool_int }
+
+    vector_perm! { vector_float }
+
+    pub trait VectorAdd<Other> {
+        type Result;
+        unsafe fn vec_add(self, other: Other) -> Self::Result;
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vaddubm))]
+    pub unsafe fn vec_add_bc_sc(a: vector_bool_char, b: vector_signed_char) -> vector_signed_char {
+        simd_add(::mem::transmute(a), b)
+    }
+    impl VectorAdd<vector_signed_char> for vector_bool_char {
+        type Result = vector_signed_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result {
+            vec_add_bc_sc(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_char> for vector_signed_char {
+        type Result = vector_signed_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vaddubm))]
+    pub unsafe fn vec_add_sc_sc(
+        a: vector_signed_char,
+        b: vector_signed_char,
+    ) -> vector_signed_char {
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_signed_char> for vector_signed_char {
+        type Result = vector_signed_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result {
+            vec_add_sc_sc(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vaddubm))]
+    pub unsafe fn vec_add_bc_uc(
+        a: vector_bool_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_char {
+        simd_add(::mem::transmute(a), b)
+    }
+    impl VectorAdd<vector_unsigned_char> for vector_bool_char {
+        type Result = vector_unsigned_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result {
+            vec_add_bc_uc(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_char> for vector_unsigned_char {
+        type Result = vector_unsigned_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vaddubm))]
+    pub unsafe fn vec_add_uc_uc(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_char {
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_unsigned_char> for vector_unsigned_char {
+        type Result = vector_unsigned_char;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result {
+            vec_add_uc_uc(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduhm))]
+    pub unsafe fn vec_add_bs_ss(
+        a: vector_bool_short,
+        b: vector_signed_short,
+    ) -> vector_signed_short {
+        let a: i16x8 = ::mem::transmute(a);
+        let a: vector_signed_short = simd_cast(a);
+        simd_add(a, b)
+    }
+
+    impl VectorAdd<vector_signed_short> for vector_bool_short {
+        type Result = vector_signed_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result {
+            vec_add_bs_ss(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_short> for vector_signed_short {
+        type Result = vector_signed_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduhm))]
+    pub unsafe fn vec_add_ss_ss(
+        a: vector_signed_short,
+        b: vector_signed_short,
+    ) -> vector_signed_short {
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_signed_short> for vector_signed_short {
+        type Result = vector_signed_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result {
+            vec_add_ss_ss(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduhm))]
+    pub unsafe fn vec_add_bs_us(
+        a: vector_bool_short,
+        b: vector_unsigned_short,
+    ) -> vector_unsigned_short {
+        let a: i16x8 = ::mem::transmute(a);
+        let a: vector_unsigned_short = simd_cast(a);
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_unsigned_short> for vector_bool_short {
+        type Result = vector_unsigned_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result {
+            vec_add_bs_us(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_short> for vector_unsigned_short {
+        type Result = vector_unsigned_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduhm))]
+    pub unsafe fn vec_add_us_us(
+        a: vector_unsigned_short,
+        b: vector_unsigned_short,
+    ) -> vector_unsigned_short {
+        simd_add(a, b)
+    }
+
+    impl VectorAdd<vector_unsigned_short> for vector_unsigned_short {
+        type Result = vector_unsigned_short;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result {
+            vec_add_us_us(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduwm))]
+    pub unsafe fn vec_add_bi_si(a: vector_bool_int, b: vector_signed_int) -> vector_signed_int {
+        let a: i32x4 = ::mem::transmute(a);
+        let a: vector_signed_int = simd_cast(a);
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_signed_int> for vector_bool_int {
+        type Result = vector_signed_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_int) -> Self::Result {
+            vec_add_bi_si(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_int> for vector_signed_int {
+        type Result = vector_signed_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduwm))]
+    pub unsafe fn vec_add_si_si(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_signed_int> for vector_signed_int {
+        type Result = vector_signed_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_signed_int) -> Self::Result {
+            vec_add_si_si(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduwm))]
+    pub unsafe fn vec_add_bi_ui(a: vector_bool_int, b: vector_unsigned_int) -> vector_unsigned_int {
+        let a: i32x4 = ::mem::transmute(a);
+        let a: vector_unsigned_int = simd_cast(a);
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_unsigned_int> for vector_bool_int {
+        type Result = vector_unsigned_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result {
+            vec_add_bi_ui(self, other)
+        }
+    }
+    impl VectorAdd<vector_bool_int> for vector_unsigned_int {
+        type Result = vector_unsigned_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result {
+            other.vec_add(self)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vadduwm))]
+    pub unsafe fn vec_add_ui_ui(
+        a: vector_unsigned_int,
+        b: vector_unsigned_int,
+    ) -> vector_unsigned_int {
+        simd_add(a, b)
+    }
+    impl VectorAdd<vector_unsigned_int> for vector_unsigned_int {
+        type Result = vector_unsigned_int;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result {
+            vec_add_ui_ui(self, other)
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(xvaddsp))]
+    pub unsafe fn vec_add_float_float(a: vector_float, b: vector_float) -> vector_float {
+        simd_add(a, b)
+    }
+
+    impl VectorAdd<vector_float> for vector_float {
+        type Result = vector_float;
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_add(self, other: vector_float) -> Self::Result {
+            vec_add_float_float(self, other)
+        }
+    }
+
+    pub trait VectorMladd<Other> {
+        type Result;
+        unsafe fn vec_mladd(self, b: Other, c: Other) -> Self::Result;
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vmladduhm))]
+    unsafe fn mladd(a: i16x8, b: i16x8, c: i16x8) -> i16x8 {
+        simd_add(simd_mul(a, b), c)
+    }
+
+    macro_rules! vector_mladd {
+        ($a: ident, $bc: ident, $d: ident) => {
+            impl VectorMladd<$bc> for $a {
+                type Result = $d;
+                #[inline]
+                #[target_feature(enable = "altivec")]
+                unsafe fn vec_mladd(self, b: $bc, c: $bc) -> Self::Result {
+                    let a: i16x8 = ::mem::transmute(self);
+                    let b: i16x8 = ::mem::transmute(b);
+                    let c: i16x8 = ::mem::transmute(c);
+
+                    ::mem::transmute(mladd(a, b, c))
+                }
+            }
+        };
+    }
+
+    // Columns: type of the first operand, type of the second and third
+    // operands, result type.
+    vector_mladd! { vector_unsigned_short, vector_unsigned_short, vector_unsigned_short }
+    vector_mladd! { vector_unsigned_short, vector_signed_short, vector_signed_short }
+    vector_mladd! { vector_signed_short, vector_unsigned_short, vector_signed_short }
+    vector_mladd! { vector_signed_short, vector_signed_short, vector_signed_short }
+}
+
+/// Vector add.
+///
+/// Generic front end: the operand types `T` and `U` select the
+/// `sealed::VectorAdd` implementation, which also fixes the result type.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_add<T, U>(a: T, b: U) -> <T as sealed::VectorAdd<U>>::Result
+where
+    T: sealed::VectorAdd<U>,
+{
+    <T as sealed::VectorAdd<U>>::vec_add(a, b)
+}
+
+/// Endian-biased intrinsics
+#[cfg(target_endian = "little")]
+mod endian {
+    use super::*;
+    /// Vector permute.
+    ///
+    /// Little-endian variant: the mask is complemented and the two data
+    /// operands are exchanged before the underlying `vec_vperm` is called.
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_perm<T>(a: T, b: T, c: vector_unsigned_char) -> T
+    where
+        T: sealed::VectorPerm,
+    {
+        // vperm has big-endian bias: complement every mask byte (xor with
+        // all ones) ...
+        let all_ones: vector_unsigned_char = ::mem::transmute(u8x16::splat(255));
+        let flipped_mask = simd_xor(c, all_ones);
+
+        // ... and pass the data operands in reverse order.
+        <T as sealed::VectorPerm>::vec_vperm(b, a, flipped_mask)
+    }
+
+    /// Vector Sum Across Partial (1/2) Saturated
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+        // vsum2sws has big-endian bias: swap the even and odd 32-bit
+        // elements of `b` before the call and of the result after it.
+        let swap_words: vector_unsigned_char = ::mem::transmute(u8x16::new(
+            4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11,
+        ));
+        let b_swapped = vec_perm(b, b, swap_words);
+        let partial = vsum2sws(a, b_swapped);
+
+        vec_perm(partial, partial, swap_words)
+    }
+
+    // Even and Odd are swapped in little-endian, so each function below
+    // dispatches to the opposite sealed trait.
+    /// Vector Multiply Even
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_mule<T, U>(a: T, b: T) -> U
+    where
+        T: sealed::VectorMulo<U>,
+    {
+        <T as sealed::VectorMulo<U>>::vec_mulo(a, b)
+    }
+    /// Vector Multiply Odd
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_mulo<T, U>(a: T, b: T) -> U
+    where
+        T: sealed::VectorMule<U>,
+    {
+        <T as sealed::VectorMule<U>>::vec_mule(a, b)
+    }
+}
+
+/// Vector Multiply Add Saturated
+///
+/// Passes `a`, `b` and `c`, in that order, to the `vmhaddshs` intrinsic.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[cfg_attr(test, assert_instr(vmhaddshs))]
+pub unsafe fn vec_madds(
+    a: vector_signed_short,
+    b: vector_signed_short,
+    c: vector_signed_short,
+) -> vector_signed_short {
+    vmhaddshs(a, b, c)
+}
+
+/// Vector Multiply Low and Add Unsigned Half Word
+///
+/// Generic front end: `T` (type of `a`) and `U` (type of `b` and `c`) select
+/// the `sealed::VectorMladd` implementation, which fixes the result type.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_mladd<T, U>(a: T, b: U, c: U) -> <T as sealed::VectorMladd<U>>::Result
+where
+    T: sealed::VectorMladd<U>,
+{
+    <T as sealed::VectorMladd<U>>::vec_mladd(a, b, c)
+}
+
+/// Vector Multiply Round and Add Saturated
+///
+/// Passes `a`, `b` and `c`, in that order, to the `vmhraddshs` intrinsic.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[cfg_attr(test, assert_instr(vmhraddshs))]
+pub unsafe fn vec_mradds(
+    a: vector_signed_short,
+    b: vector_signed_short,
+    c: vector_signed_short,
+) -> vector_signed_short {
+    vmhraddshs(a, b, c)
+}
+
+/// Vector Multiply Sum
+///
+/// Generic front end over `sealed::VectorMsum`; the result has the type of
+/// the third operand `c`.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_msum<T, B, U>(a: T, b: B, c: U) -> U
+where
+    T: sealed::VectorMsum<B, U>,
+{
+    <T as sealed::VectorMsum<B, U>>::vec_msum(a, b, c)
+}
+
+/// Vector Multiply Sum Saturated
+///
+/// Generic front end over `sealed::VectorMsums`; the result has the type of
+/// the third operand `c`.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_msums<T, U>(a: T, b: T, c: U) -> U
+where
+    T: sealed::VectorMsums<U>,
+{
+    <T as sealed::VectorMsums<U>>::vec_msums(a, b, c)
+}
+
+/// Vector Multiply Add
+///
+/// Passes `a`, `b` and `c`, in that order, to the `vmaddfp` intrinsic.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_madd(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+    vmaddfp(a, b, c)
+}
+
+/// Vector Negative Multiply Subtract
+///
+/// Passes `a`, `b` and `c`, in that order, to the `vnmsubfp` intrinsic.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_nmsub(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+    vnmsubfp(a, b, c)
+}
+
+/// Vector Sum Across Partial (1/4) Saturated
+///
+/// Generic front end over `sealed::VectorSum4s`; the result has the type of
+/// the second operand `b`.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "altivec")]`; callers must make
+/// sure the running CPU supports AltiVec.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_sum4s<T, U>(a: T, b: U) -> U
+where
+    T: sealed::VectorSum4s<U>,
+{
+    <T as sealed::VectorSum4s<U>>::vec_sum4s(a, b)
+}
+
+/// Endian-biased intrinsics (big-endian build: every function forwards its
+/// arguments unchanged).
+#[cfg(target_endian = "big")]
+mod endian {
+    use super::*;
+    /// Vector permute.
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_perm<T>(a: T, b: T, c: vector_unsigned_char) -> T
+    where
+        T: sealed::VectorPerm,
+    {
+        <T as sealed::VectorPerm>::vec_vperm(a, b, c)
+    }
+
+    /// Vector Sum Across Partial (1/2) Saturated
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+        vsum2sws(a, b)
+    }
+
+    /// Vector Multiply Even
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_mule<T, U>(a: T, b: T) -> U
+    where
+        T: sealed::VectorMule<U>,
+    {
+        <T as sealed::VectorMule<U>>::vec_mule(a, b)
+    }
+    /// Vector Multiply Odd
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    pub unsafe fn vec_mulo<T, U>(a: T, b: T) -> U
+    where
+        T: sealed::VectorMulo<U>,
+    {
+        <T as sealed::VectorMulo<U>>::vec_mulo(a, b)
+    }
+}
+
+pub use self::endian::*;
+
+#[cfg(test)]
+mod tests {
+    #[cfg(target_arch = "powerpc")]
+    use core_arch::arch::powerpc::*;
+
+    #[cfg(target_arch = "powerpc64")]
+    use core_arch::arch::powerpc64::*;
+
+    use core_arch::simd::*;
+    use stdsimd_test::simd_test;
+
+    // Generates one `vec_perm` test named `$name`.
+    //
+    // `$shorttype` is the portable SIMD type used to build the inputs and to
+    // compare the result; `$longtype` is the PowerPC vector type actually
+    // passed to `vec_perm`. The four bracketed lists are, in order: first
+    // operand, second operand, 16-byte permute mask, expected result.
+    macro_rules! test_vec_perm {
+        {$name:ident,
+         $shorttype:ident, $longtype:ident,
+         [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+            #[simd_test(enable = "altivec")]
+            unsafe fn $name() {
+                let a: $longtype = ::mem::transmute($shorttype::new($($a),+));
+                let b: $longtype = ::mem::transmute($shorttype::new($($b),+));
+                // The mask is always a byte vector, whatever the data type.
+                let c: vector_unsigned_char = ::mem::transmute(u8x16::new($($c),+));
+                let d = $shorttype::new($($d),+);
+
+                let r: $shorttype = ::mem::transmute(vec_perm(a, b, c));
+                assert_eq!(d, r);
+            }
+        }
+    }
+
+    // As the expected vectors below show, a mask byte 0x0N picks byte N of
+    // the first operand and 0x1N picks byte N of the second operand.
+
+    // 8-bit lanes: byte pairs taken alternately from each operand.
+    test_vec_perm! {test_vec_perm_u8x16,
+    u8x16, vector_unsigned_char,
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+    test_vec_perm! {test_vec_perm_i8x16,
+    i8x16, vector_signed_char,
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+
+    test_vec_perm! {test_vec_perm_m8x16,
+    m8x16, vector_bool_char,
+    [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+    [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]}
+    // 16-bit lanes: the same mask moves whole lanes, interleaving the operands.
+    test_vec_perm! {test_vec_perm_u16x8,
+    u16x8, vector_unsigned_short,
+    [0, 1, 2, 3, 4, 5, 6, 7],
+    [10, 11, 12, 13, 14, 15, 16, 17],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 10, 1, 11, 2, 12, 3, 13]}
+    test_vec_perm! {test_vec_perm_i16x8,
+    i16x8, vector_signed_short,
+    [0, 1, 2, 3, 4, 5, 6, 7],
+    [10, 11, 12, 13, 14, 15, 16, 17],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 10, 1, 11, 2, 12, 3, 13]}
+    test_vec_perm! {test_vec_perm_m16x8,
+    m16x8, vector_bool_short,
+    [false, false, false, false, false, false, false, false],
+    [true, true, true, true, true, true, true, true],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [false, true, false, true, false, true, false, true]}
+
+    // 32-bit lanes: the mask groups four bytes at a time.
+    test_vec_perm! {test_vec_perm_u32x4,
+    u32x4, vector_unsigned_int,
+    [0, 1, 2, 3],
+    [10, 11, 12, 13],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0, 10, 1, 11]}
+    test_vec_perm! {test_vec_perm_i32x4,
+    i32x4, vector_signed_int,
+    [0, 1, 2, 3],
+    [10, 11, 12, 13],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0, 10, 1, 11]}
+    test_vec_perm! {test_vec_perm_m32x4,
+    m32x4, vector_bool_int,
+    [false, false, false, false],
+    [true, true, true, true],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [false, true, false, true]}
+    test_vec_perm! {test_vec_perm_f32x4,
+    f32x4, vector_float,
+    [0.0, 1.0, 2.0, 3.0],
+    [1.0, 1.1, 1.2, 1.3],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0.0, 1.0, 1.0, 1.1]}
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_madds() {
+        // a = lane index * 256, b = 256 in every lane, c = lane index.
+        let a: vector_signed_short =
+            ::mem::transmute(i16x8::new(0, 256, 512, 768, 1024, 1280, 1536, 1792));
+        let b: vector_signed_short = ::mem::transmute(i16x8::splat(256));
+        let c: vector_signed_short = ::mem::transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+
+        let expected = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21);
+        let actual: i16x8 = ::mem::transmute(vec_madds(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_madd_float() {
+        // All three operands carry the same lane values.
+        let lanes = f32x4::new(0.1, 0.2, 0.3, 0.4);
+        let a: vector_float = ::mem::transmute(lanes);
+        let b: vector_float = ::mem::transmute(lanes);
+        let c: vector_float = ::mem::transmute(lanes);
+
+        let expected = f32x4::new(
+            0.1 * 0.1 + 0.1,
+            0.2 * 0.2 + 0.2,
+            0.3 * 0.3 + 0.3,
+            0.4 * 0.4 + 0.4,
+        );
+        let actual: f32x4 = ::mem::transmute(vec_madd(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_nmsub_float() {
+        // All three operands carry the same lane values.
+        let lanes = f32x4::new(0.1, 0.2, 0.3, 0.4);
+        let a: vector_float = ::mem::transmute(lanes);
+        let b: vector_float = ::mem::transmute(lanes);
+        let c: vector_float = ::mem::transmute(lanes);
+
+        let expected = f32x4::new(
+            -(0.1 * 0.1 - 0.1),
+            -(0.2 * 0.2 - 0.2),
+            -(0.3 * 0.3 - 0.3),
+            -(0.4 * 0.4 - 0.4),
+        );
+        let actual: f32x4 = ::mem::transmute(vec_nmsub(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mradds() {
+        // a = lane index * 256, b = 256 in every lane.
+        let a: vector_signed_short =
+            ::mem::transmute(i16x8::new(0, 256, 512, 768, 1024, 1280, 1536, 1792));
+        let b: vector_signed_short = ::mem::transmute(i16x8::splat(256));
+        // The last addend sits one below the maximum so that the expected
+        // result in that lane is `i16::max_value()`.
+        let c: vector_signed_short =
+            ::mem::transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, i16::max_value() - 1));
+
+        let expected = i16x8::new(0, 3, 6, 9, 12, 15, 18, i16::max_value());
+        let actual: i16x8 = ::mem::transmute(vec_mradds(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    // Generates one `vec_mladd` test named `$name`.
+    //
+    // `$sa`/`$la`: portable and PowerPC type of the first operand.
+    // `$sbc`/`$lbc`: portable and PowerPC type of the second operand; the
+    // third operand is built from `$sbc` and its vector type is inferred.
+    // `$sd`: portable type the result is compared as.
+    // The four lists are the lanes of `a`, `b`, `c` and the expected result.
+    macro_rules! test_vec_mladd {
+        {$name:ident, $sa:ident, $la:ident, $sbc:ident, $lbc:ident, $sd:ident,
+            [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+            #[simd_test(enable = "altivec")]
+            unsafe fn $name() {
+                let a: $la = ::mem::transmute($sa::new($($a),+));
+                let b: $lbc = ::mem::transmute($sbc::new($($b),+));
+                let c = ::mem::transmute($sbc::new($($c),+));
+                let d = $sd::new($($d),+);
+
+                assert_eq!(d, ::mem::transmute(vec_mladd(a, b, c)));
+            }
+        }
+    }
+
+    // One case per `VectorMladd` implementation. The PowerPC type given for
+    // the second operand must match its portable type: it alone decides
+    // which implementation `vec_mladd` dispatches to.
+    test_vec_mladd! { test_vec_mladd_u16x8_u16x8, u16x8, vector_unsigned_short, u16x8, vector_unsigned_short, u16x8,
+        [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+    }
+    test_vec_mladd! { test_vec_mladd_u16x8_i16x8, u16x8, vector_unsigned_short, i16x8, vector_signed_short, i16x8,
+        [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+    }
+    test_vec_mladd! { test_vec_mladd_i16x8_u16x8, i16x8, vector_signed_short, u16x8, vector_unsigned_short, i16x8,
+        [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+    }
+    test_vec_mladd! { test_vec_mladd_i16x8_i16x8, i16x8, vector_signed_short, i16x8, vector_signed_short, i16x8,
+        [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msum_unsigned_char() {
+        let a: vector_unsigned_char =
+            ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+        let b: vector_unsigned_char = ::mem::transmute(u8x16::splat(255));
+        let c: vector_unsigned_int = ::mem::transmute(u32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: four byte products summed, plus the matching
+        // lane of `c`.
+        let expected = u32x4::new(
+            (0 + 1 + 2 + 3) * 255 + 0,
+            (4 + 5 + 6 + 7) * 255 + 1,
+            (0 + 1 + 2 + 3) * 255 + 2,
+            (4 + 5 + 6 + 7) * 255 + 3,
+        );
+        let actual: u32x4 = ::mem::transmute(vec_msum(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msum_signed_char() {
+        let a: vector_signed_char = ::mem::transmute(i8x16::new(
+            0, -1, 2, -3, 1, -1, 1, -1, 0, 1, 2, 3, 4, -5, -6, -7,
+        ));
+        // Each operand is built from the portable type that matches its
+        // vector type: unsigned bytes for `b`, signed words for `c`.
+        let b: vector_unsigned_char =
+            ::mem::transmute(u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1));
+        let c: vector_signed_int = ::mem::transmute(i32x4::new(0, 1, 2, 3));
+        // Per 32-bit lane: the four bytes of `a` (each multiplied by 1)
+        // summed, plus the matching lane of `c`.
+        let d = i32x4::new(
+            (0 - 1 + 2 - 3) + 0,
+            (0) + 1,
+            (0 + 1 + 2 + 3) + 2,
+            (4 - 5 - 6 - 7) + 3,
+        );
+
+        assert_eq!(d, ::mem::transmute(vec_msum(a, b, c)));
+    }
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msum_unsigned_short() {
+        // a = lane index * 256, b = 256 in every lane.
+        let a: vector_unsigned_short =
+            ::mem::transmute(u16x8::new(0, 256, 512, 768, 1024, 1280, 1536, 1792));
+        let b: vector_unsigned_short = ::mem::transmute(u16x8::splat(256));
+        let c: vector_unsigned_int = ::mem::transmute(u32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: two halfword products summed, plus the matching
+        // lane of `c`.
+        let expected = u32x4::new(
+            (0 + 1) * 256 * 256 + 0,
+            (2 + 3) * 256 * 256 + 1,
+            (4 + 5) * 256 * 256 + 2,
+            (6 + 7) * 256 * 256 + 3,
+        );
+        let actual: u32x4 = ::mem::transmute(vec_msum(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msum_signed_short() {
+        // a = lane index * 256 with odd lanes negated, b = 256 in every lane.
+        let a: vector_signed_short =
+            ::mem::transmute(i16x8::new(0, -256, 512, -768, 1024, -1280, 1536, -1792));
+        let b: vector_signed_short = ::mem::transmute(i16x8::splat(256));
+        let c: vector_signed_int = ::mem::transmute(i32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: two halfword products summed, plus the matching
+        // lane of `c`.
+        let expected = i32x4::new(
+            (0 - 1) * 256 * 256 + 0,
+            (2 - 3) * 256 * 256 + 1,
+            (4 - 5) * 256 * 256 + 2,
+            (6 - 7) * 256 * 256 + 3,
+        );
+        let actual: i32x4 = ::mem::transmute(vec_msum(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msums_unsigned() {
+        // a = lane index * 256, b = 256 in every lane.
+        let a: vector_unsigned_short =
+            ::mem::transmute(u16x8::new(0, 256, 512, 768, 1024, 1280, 1536, 1792));
+        let b: vector_unsigned_short = ::mem::transmute(u16x8::splat(256));
+        let c: vector_unsigned_int = ::mem::transmute(u32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: two halfword products summed, plus the matching
+        // lane of `c`.
+        let expected = u32x4::new(
+            (0 + 1) * 256 * 256 + 0,
+            (2 + 3) * 256 * 256 + 1,
+            (4 + 5) * 256 * 256 + 2,
+            (6 + 7) * 256 * 256 + 3,
+        );
+        let actual: u32x4 = ::mem::transmute(vec_msums(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_msums_signed() {
+        // a = lane index * 256 with odd lanes negated, b = 256 in every lane.
+        let a: vector_signed_short =
+            ::mem::transmute(i16x8::new(0, -256, 512, -768, 1024, -1280, 1536, -1792));
+        let b: vector_signed_short = ::mem::transmute(i16x8::splat(256));
+        let c: vector_signed_int = ::mem::transmute(i32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: two halfword products summed, plus the matching
+        // lane of `c`.
+        let expected = i32x4::new(
+            (0 - 1) * 256 * 256 + 0,
+            (2 - 3) * 256 * 256 + 1,
+            (4 - 5) * 256 * 256 + 2,
+            (6 - 7) * 256 * 256 + 3,
+        );
+        let actual: i32x4 = ::mem::transmute(vec_msums(a, b, c));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_sum2s() {
+        let lanes = i32x4::new(0, 1, 2, 3);
+        let a: vector_signed_int = ::mem::transmute(lanes);
+        let b: vector_signed_int = ::mem::transmute(lanes);
+
+        // Lanes 1 and 3 hold the partial sums; lanes 0 and 2 are zero.
+        let expected = i32x4::new(0, 0 + 1 + 1, 0, 2 + 3 + 3);
+        let actual: i32x4 = ::mem::transmute(vec_sum2s(a, b));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_sum4s_unsigned_char() {
+        let a: vector_unsigned_char =
+            ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+        let b: vector_unsigned_int = ::mem::transmute(u32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: four bytes of `a` plus the matching lane of `b`.
+        let expected = u32x4::new(
+            0 + 1 + 2 + 3 + 0,
+            4 + 5 + 6 + 7 + 1,
+            0 + 1 + 2 + 3 + 2,
+            4 + 5 + 6 + 7 + 3,
+        );
+        let actual: u32x4 = ::mem::transmute(vec_sum4s(a, b));
+
+        assert_eq!(expected, actual);
+    }
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_sum4s_signed_char() {
+        let a: vector_signed_char =
+            ::mem::transmute(i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+        let b: vector_signed_int = ::mem::transmute(i32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: four bytes of `a` plus the matching lane of `b`.
+        let expected = i32x4::new(
+            0 + 1 + 2 + 3 + 0,
+            4 + 5 + 6 + 7 + 1,
+            0 + 1 + 2 + 3 + 2,
+            4 + 5 + 6 + 7 + 3,
+        );
+        let actual: i32x4 = ::mem::transmute(vec_sum4s(a, b));
+
+        assert_eq!(expected, actual);
+    }
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_sum4s_signed_short() {
+        let a: vector_signed_short = ::mem::transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+        let b: vector_signed_int = ::mem::transmute(i32x4::new(0, 1, 2, 3));
+
+        // Per 32-bit lane: two halfwords of `a` plus the matching lane of `b`.
+        let expected = i32x4::new(0 + 1 + 0, 2 + 3 + 1, 4 + 5 + 2, 6 + 7 + 3);
+        let actual: i32x4 = ::mem::transmute(vec_sum4s(a, b));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mule_unsigned_char() {
+        let a: vector_unsigned_char =
+            ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+
+        // Squares of the even-indexed lanes.
+        let expected = u16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6);
+        let actual: u16x8 = ::mem::transmute(vec_mule(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mule_signed_char() {
+        let a: vector_signed_char = ::mem::transmute(i8x16::new(
+            0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+        ));
+
+        // Squares of the even-indexed lanes (the sign disappears).
+        let expected = i16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6);
+        let actual: i16x8 = ::mem::transmute(vec_mule(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mule_unsigned_short() {
+        let a: vector_unsigned_short = ::mem::transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+
+        // Squares of the even-indexed lanes.
+        let expected = u32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6);
+        let actual: u32x4 = ::mem::transmute(vec_mule(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mule_signed_short() {
+        let a: vector_signed_short = ::mem::transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7));
+
+        // Squares of the even-indexed lanes (the sign disappears).
+        let expected = i32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6);
+        let actual: i32x4 = ::mem::transmute(vec_mule(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mulo_unsigned_char() {
+        let a: vector_unsigned_char =
+            ::mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+
+        // Squares of the odd-indexed lanes.
+        let expected = u16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7);
+        let actual: u16x8 = ::mem::transmute(vec_mulo(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mulo_signed_char() {
+        let a: vector_signed_char = ::mem::transmute(i8x16::new(
+            0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+        ));
+
+        // Squares of the odd-indexed lanes.
+        let expected = i16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7);
+        let actual: i16x8 = ::mem::transmute(vec_mulo(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mulo_unsigned_short() {
+        let a: vector_unsigned_short = ::mem::transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+
+        // Squares of the odd-indexed lanes.
+        let expected = u32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7);
+        let actual: u32x4 = ::mem::transmute(vec_mulo(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mulo_signed_short() {
+        let a: vector_signed_short = ::mem::transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7));
+
+        // Squares of the odd-indexed lanes.
+        let expected = i32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7);
+        let actual: i32x4 = ::mem::transmute(vec_mulo(a, a));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[simd_test(enable = "altivec")]
+    unsafe fn vec_add_i32x4_i32x4() {
+        // The two inputs are mirror images, so every lane sums to 5.
+        let x: vector_signed_int = ::mem::transmute(i32x4::new(1, 2, 3, 4));
+        let y: vector_signed_int = ::mem::transmute(i32x4::new(4, 3, 2, 1));
+
+        let sum: i32x4 = ::mem::transmute(vec_add(x, y));
+        assert_eq!(i32x4::splat(5), sum);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/mod.rs b/library/stdarch/crates/core_arch/src/powerpc/mod.rs
new file mode 100644
index 00000000000..c7829d30b73
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/mod.rs
@@ -0,0 +1,19 @@
+//! PowerPC intrinsics
+
+#[cfg(target_feature = "altivec")]
+mod altivec;
+#[cfg(target_feature = "altivec")]
+pub use self::altivec::*;
+
+mod vsx;
+pub use self::vsx::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Generates the trap instruction `TRAP`
+///
+/// Implemented as a call to `::intrinsics::abort()`; the function never
+/// returns.
+#[cfg_attr(test, assert_instr(trap))]
+#[inline]
+pub unsafe fn trap() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/vsx.rs b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs
new file mode 100644
index 00000000000..e6d37984071
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs
@@ -0,0 +1,116 @@
+//! PowerPC Vector Scalar eXtensions (VSX) intrinsics.
+//!
+//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA
+//! NVlink)] and [POWER ISA v3.0B (for POWER9)].
+//!
+//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u
+//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv
+
+#![allow(non_camel_case_types)]
+
+use core_arch::simd_llvm::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+use mem;
+
+// NOTE(review): the commented-out entries below look like placeholders for
+// vector types that are not defined here yet — confirm before relying on
+// that reading.
+types! {
+    // pub struct vector_Float16 = f16x8;
+    /// PowerPC-specific 128-bit wide vector of two packed `i64`
+    pub struct vector_signed_long(i64, i64);
+    /// PowerPC-specific 128-bit wide vector of two packed `u64`
+    pub struct vector_unsigned_long(u64, u64);
+    /// PowerPC-specific 128-bit wide vector mask of two elements
+    pub struct vector_bool_long(i64, i64);
+    /// PowerPC-specific 128-bit wide vector of two packed `f64`
+    pub struct vector_double(f64, f64);
+    // pub struct vector_signed_long_long = vector_signed_long;
+    // pub struct vector_unsigned_long_long = vector_unsigned_long;
+    // pub struct vector_bool_long_long = vector_bool_long;
+    // pub struct vector_signed___int128 = i128x1;
+    // pub struct vector_unsigned___int128 = i128x1;
+}
+
+// Implementation details behind the public `vec_xxpermdi` function.
+mod sealed {
+    use super::*;
+    use core_arch::simd::*;
+
+    /// Dispatch trait for `vec_xxpermdi`; implemented below for the four
+    /// two-lane vector types of this module.
+    pub trait VectorPermDI {
+        /// Builds a two-lane vector from one lane of `self` and one lane of
+        /// `b`, chosen by the low two bits of `dm`.
+        unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self;
+    }
+
+    // xxpermdi has a big-endian bias and extended mnemonics
+    //
+    // Only the low two bits of `dm` are used. Bit 0 picks the lane taken from
+    // `a` (shuffle index 0 or 1) and bit 1 picks the lane taken from `b`
+    // (shuffle index 2 or 3). Each arm spells its index array out as a
+    // literal.
+    #[inline]
+    #[target_feature(enable = "vsx")]
+    #[cfg_attr(all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0))]
+    #[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))]
+    unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
+        match dm & 0b11 {
+            0 => simd_shuffle2(a, b, [0b00, 0b10]),
+            1 => simd_shuffle2(a, b, [0b01, 0b10]),
+            2 => simd_shuffle2(a, b, [0b00, 0b11]),
+            _ => simd_shuffle2(a, b, [0b01, 0b11]),
+        }
+    }
+
+    // Implements `VectorPermDI` for `$impl` by reinterpreting the operands as
+    // `i64x2`, calling `xxpermdi`, and reinterpreting the result back.
+    macro_rules! vec_xxpermdi {
+        {$impl: ident} => {
+            impl VectorPermDI for $impl {
+                #[inline]
+                #[target_feature(enable = "vsx")]
+                unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self {
+                    mem::transmute(xxpermdi(mem::transmute(self), mem::transmute(b), dm))
+                }
+            }
+        }
+    }
+
+    vec_xxpermdi! { vector_unsigned_long }
+    vec_xxpermdi! { vector_signed_long }
+    vec_xxpermdi! { vector_bool_long }
+    vec_xxpermdi! { vector_double }
+}
+
+/// Vector permute.
+///
+/// Generic front end over `sealed::VectorPermDI`. The selector `dm` is
+/// marked as a required constant through `rustc_args_required_const`.
+///
+/// # Safety
+///
+/// Declared with `#[target_feature(enable = "vsx")]`; callers must make sure
+/// the running CPU supports VSX.
+#[inline]
+#[target_feature(enable = "vsx")]
+#[rustc_args_required_const(2)]
+pub unsafe fn vec_xxpermdi<T>(a: T, b: T, dm: u8) -> T
+where
+    T: sealed::VectorPermDI,
+{
+    <T as sealed::VectorPermDI>::vec_xxpermdi(a, b, dm)
+}
+
+// Unit tests for `vec_xxpermdi`, one per supported vector type.
+#[cfg(test)]
+mod tests {
+    #[cfg(target_arch = "powerpc")]
+    use core_arch::arch::powerpc::*;
+
+    #[cfg(target_arch = "powerpc64")]
+    use core_arch::arch::powerpc64::*;
+
+    use core_arch::simd::*;
+    use stdsimd_test::simd_test;
+
+    // Generates one `vec_xxpermdi` test named `$name`.
+    //
+    // `$shorttype` is the portable SIMD type, `$longtype` the PowerPC vector
+    // type. The four single-element lists `$a`, `$b`, `$c`, `$d` are lane
+    // values, not vectors: the first operand is built as `($a, $b)` and the
+    // second as `($c, $d)`. One assertion pins the result for each selector
+    // value 0..=3.
+    macro_rules! test_vec_xxpermdi {
+        {$name:ident, $shorttype:ident, $longtype:ident, [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+            #[simd_test(enable = "vsx")]
+            unsafe fn $name() {
+                let a: $longtype = ::mem::transmute($shorttype::new($($a),+, $($b),+));
+                let b = ::mem::transmute($shorttype::new($($c),+, $($d),+));
+
+                // Selector bit 0 picks the lane of `a`, bit 1 the lane of `b`.
+                assert_eq!($shorttype::new($($a),+, $($c),+), ::mem::transmute(vec_xxpermdi(a, b, 0)));
+                assert_eq!($shorttype::new($($b),+, $($c),+), ::mem::transmute(vec_xxpermdi(a, b, 1)));
+                assert_eq!($shorttype::new($($a),+, $($d),+), ::mem::transmute(vec_xxpermdi(a, b, 2)));
+                assert_eq!($shorttype::new($($b),+, $($d),+), ::mem::transmute(vec_xxpermdi(a, b, 3)));
+            }
+        }
+    }
+
+    test_vec_xxpermdi! {test_vec_xxpermdi_u64x2, u64x2, vector_unsigned_long, [0], [1], [2], [3]}
+    test_vec_xxpermdi! {test_vec_xxpermdi_i64x2, i64x2, vector_signed_long, [0], [-1], [2], [-3]}
+    test_vec_xxpermdi! {test_vec_xxpermdi_m64x2, m64x2, vector_bool_long, [false], [true], [false], [true]}
+    test_vec_xxpermdi! {test_vec_xxpermdi_f64x2, f64x2, vector_double, [0.0], [1.0], [2.0], [3.0]}
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc64/mod.rs b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs
new file mode 100644
index 00000000000..2e5c329fc79
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs
@@ -0,0 +1,8 @@
+//! PowerPC 64
+//!
+//! The reference is the [64-Bit ELF V2 ABI Specification - Power
+//! Architecture].
+//!
+//! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf
+
+pub use core_arch::powerpc::*;
diff --git a/library/stdarch/crates/core_arch/src/simd.rs b/library/stdarch/crates/core_arch/src/simd.rs
new file mode 100644
index 00000000000..468b1e380bd
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/simd.rs
@@ -0,0 +1,191 @@
+//! Internal `#[repr(simd)]` types
+
+#![rustfmt::skip]
+#![allow(non_camel_case_types)]
+
+// Declares a crate-private `#[repr(simd)]` tuple struct and its helpers.
+//
+// Invocation shape: `name[lane type]: field type per lane | name per lane`.
+// The lane names become the parameter names of `new`.
+macro_rules! simd_ty {
+    ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => {
+        #[repr(simd)]
+        #[derive(Copy, Clone, Debug, PartialEq)]
+        pub(crate) struct $id($(pub $elem_ty),*);
+
+        impl $id {
+            // Builds the vector from one value per lane.
+            #[inline]
+            pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self {
+                $id($($elem_name),*)
+            }
+
+            // Builds the vector with `value` in every lane.
+            #[inline]
+            pub(crate) const fn splat(value: $ety) -> Self {
+                // The dummy `struct $elem_name;` exists only so that the
+                // repetition mentions `$elem_name` and therefore expands once
+                // per lane; each expansion evaluates to `value`.
+                $id($({
+                    #[allow(non_camel_case_types, dead_code)]
+                    struct $elem_name;
+                    value
+                }),*)
+            }
+
+            // Reads lane `index` through `simd_extract`. The index is cast
+            // to `u32` and is not range-checked here.
+            #[inline]
+            pub(crate) fn extract(self, index: usize) -> $ety {
+                unsafe {
+                    ::core_arch::simd_llvm::simd_extract(self, index as u32)
+                }
+            }
+        }
+    }
+}
+
+// Declares a crate-private `#[repr(simd)]` mask type: lanes are stored as
+// integers of type `$ety`, while the constructors and `extract` use `bool`.
+//
+// Invocation shape: `name[lane type]: field type per lane | name per lane`.
+macro_rules! simd_m_ty {
+    ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => {
+        #[repr(simd)]
+        #[derive(Copy, Clone, Debug, PartialEq)]
+        pub(crate) struct $id($(pub $elem_ty),*);
+
+        impl $id {
+            // Maps `false` to 0 and `true` to `!0` (all bits set) by
+            // indexing a two-element table.
+            #[inline]
+            const fn bool_to_internal(x: bool) -> $ety {
+                [0 as $ety, !(0 as $ety)][x as usize]
+            }
+
+            // Builds the mask from one `bool` per lane.
+            #[inline]
+            pub(crate) const fn new($($elem_name: bool),*) -> Self {
+                $id($(Self::bool_to_internal($elem_name)),*)
+            }
+
+            // Builds the mask with `value` in every lane.
+            #[inline]
+            pub(crate) const fn splat(value: bool) -> Self {
+                // The dummy `struct $elem_name;` exists only so that the
+                // repetition mentions `$elem_name` and therefore expands once
+                // per lane.
+                $id($({
+                    #[allow(non_camel_case_types, dead_code)]
+                    struct $elem_name;
+                    Self::bool_to_internal(value)
+                }),*)
+            }
+
+            // Reads lane `index` and reports whether it is non-zero. The
+            // index is cast to `u32` and is not range-checked here.
+            #[inline]
+            pub(crate) fn extract(self, index: usize) -> bool {
+                let r: $ety = unsafe {
+                    ::core_arch::simd_llvm::simd_extract(self, index as u32)
+                };
+                r != 0
+            }
+        }
+    }
+}
+
+// Each invocation reads: type name, [lane type], one field type per lane,
+// then one lane name per lane (used as the argument names of `new`). Lane
+// names start at `x0` for every type, single-lane types included.
+
+// 16-bit wide types:
+
+simd_ty!(u8x2[u8]: u8, u8 | x0, x1);
+simd_ty!(i8x2[i8]: i8, i8 | x0, x1);
+
+// 32-bit wide types:
+
+simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3);
+simd_ty!(u16x2[u16]: u16, u16 | x0, x1);
+
+simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3);
+simd_ty!(i16x2[i16]: i16, i16 | x0, x1);
+
+// 64-bit wide types:
+
+simd_ty!(u8x8[u8]:
+         u8, u8, u8, u8, u8, u8, u8, u8
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3);
+simd_ty!(u32x2[u32]: u32, u32 | x0, x1);
+simd_ty!(u64x1[u64]: u64 | x0);
+
+simd_ty!(i8x8[i8]:
+         i8, i8, i8, i8, i8, i8, i8, i8
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3);
+simd_ty!(i32x2[i32]: i32, i32 | x0, x1);
+simd_ty!(i64x1[i64]: i64 | x0);
+
+simd_ty!(f32x2[f32]: f32, f32 | x0, x1);
+
+// 128-bit wide types:
+
+simd_ty!(u8x16[u8]:
+         u8, u8, u8, u8, u8, u8, u8, u8,
+         u8, u8, u8, u8, u8, u8, u8, u8
+         | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
+);
+simd_ty!(u16x8[u16]:
+         u16, u16, u16, u16, u16, u16, u16, u16
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3);
+simd_ty!(u64x2[u64]: u64, u64 | x0, x1);
+
+simd_ty!(i8x16[i8]:
+         i8, i8, i8, i8, i8, i8, i8, i8,
+         i8, i8, i8, i8, i8, i8, i8, i8
+         | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
+);
+simd_ty!(i16x8[i16]:
+         i16, i16, i16, i16, i16, i16, i16, i16
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3);
+simd_ty!(i64x2[i64]: i64, i64 | x0, x1);
+
+simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3);
+simd_ty!(f64x2[f64]: f64, f64 | x0, x1);
+
+// 128-bit wide mask types (signed integer lanes, `bool` interface):
+
+simd_m_ty!(m8x16[i8]:
+           i8, i8, i8, i8, i8, i8, i8, i8,
+           i8, i8, i8, i8, i8, i8, i8, i8
+           | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
+);
+simd_m_ty!(m16x8[i16]:
+           i16, i16, i16, i16, i16, i16, i16, i16
+           | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3);
+simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1);
+
+// 256-bit wide types:
+
+simd_ty!(u8x32[u8]:
+         u8, u8, u8, u8, u8, u8, u8, u8,
+         u8, u8, u8, u8, u8, u8, u8, u8,
+         u8, u8, u8, u8, u8, u8, u8, u8,
+         u8, u8, u8, u8, u8, u8, u8, u8
+         | x0, x1, x2, x3, x4, x5, x6, x7,
+         x8, x9, x10, x11, x12, x13, x14, x15,
+         x16, x17, x18, x19, x20, x21, x22, x23,
+         x24, x25, x26, x27, x28, x29, x30, x31
+);
+simd_ty!(u16x16[u16]:
+         u16, u16, u16, u16, u16, u16, u16, u16,
+         u16, u16, u16, u16, u16, u16, u16, u16
+         | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
+);
+simd_ty!(u32x8[u32]:
+         u32, u32, u32, u32, u32, u32, u32, u32
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3);
+
+simd_ty!(i8x32[i8]:
+         i8, i8, i8, i8, i8, i8, i8, i8,
+         i8, i8, i8, i8, i8, i8, i8, i8,
+         i8, i8, i8, i8, i8, i8, i8, i8,
+         i8, i8, i8, i8, i8, i8, i8, i8
+         | x0, x1, x2, x3, x4, x5, x6, x7,
+         x8, x9, x10, x11, x12, x13, x14, x15,
+         x16, x17, x18, x19, x20, x21, x22, x23,
+         x24, x25, x26, x27, x28, x29, x30, x31
+);
+simd_ty!(i16x16[i16]:
+         i16, i16, i16, i16, i16, i16, i16, i16,
+         i16, i16, i16, i16, i16, i16, i16, i16
+         | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
+);
+simd_ty!(i32x8[i32]:
+         i32, i32, i32, i32, i32, i32, i32, i32
+         | x0, x1, x2, x3, x4, x5, x6, x7);
+simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3);
+
+// 512-bit wide types (only `i32x16` is declared at this width):
+
+simd_ty!(i32x16[i32]:
+         i32, i32, i32, i32, i32, i32, i32, i32,
+         i32, i32, i32, i32, i32, i32, i32, i32
+         | x0, x1, x2, x3, x4, x5, x6, x7,
+         x8, x9, x10, x11, x12, x13, x14, x15);
diff --git a/library/stdarch/crates/core_arch/src/simd_llvm.rs b/library/stdarch/crates/core_arch/src/simd_llvm.rs
new file mode 100644
index 00000000000..bbc88d3f32d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/simd_llvm.rs
@@ -0,0 +1,67 @@
+//! LLVM's simd platform intrinsics
+//!
+//! TODO: should use `link_llvm_intrinsic` instead: issue #112
+
+// Declarations of the generic SIMD operations exposed through the
+// "platform-intrinsic" ABI. Nothing is implemented here.
+// NOTE(review): the signatures place no bounds on `T`/`U`; the requirement
+// that they be suitable vector/scalar types is presumably enforced by the
+// compiler — confirm.
+extern "platform-intrinsic" {
+    // Comparisons of two `T` values; the result type `U` is picked by the
+    // caller.
+    pub fn simd_eq<T, U>(x: T, y: T) -> U;
+    pub fn simd_ne<T, U>(x: T, y: T) -> U;
+    pub fn simd_lt<T, U>(x: T, y: T) -> U;
+    pub fn simd_le<T, U>(x: T, y: T) -> U;
+    pub fn simd_gt<T, U>(x: T, y: T) -> U;
+    pub fn simd_ge<T, U>(x: T, y: T) -> U;
+
+    // Shuffles: the numeric suffix equals the length of the `idx` array.
+    pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
+    pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
+    pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
+    pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
+    pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
+    pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U;
+    pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U;
+
+    // Single-lane access: `T` is the vector type, `U` the scalar type.
+    pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
+    pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
+
+    // Conversion from `T` to `U`.
+    pub fn simd_cast<T, U>(x: T) -> U;
+
+    // Binary arithmetic, shift and bitwise operations; operands and result
+    // share one type.
+    pub fn simd_add<T>(x: T, y: T) -> T;
+    pub fn simd_sub<T>(x: T, y: T) -> T;
+    pub fn simd_mul<T>(x: T, y: T) -> T;
+    pub fn simd_div<T>(x: T, y: T) -> T;
+    pub fn simd_rem<T>(x: T, y: T) -> T;
+    pub fn simd_shl<T>(x: T, y: T) -> T;
+    pub fn simd_shr<T>(x: T, y: T) -> T;
+    pub fn simd_and<T>(x: T, y: T) -> T;
+    pub fn simd_or<T>(x: T, y: T) -> T;
+    pub fn simd_xor<T>(x: T, y: T) -> T;
+
+    // Reductions of a `T` to a `U` (or to `bool` for `all`/`any`). The
+    // `_ordered` variants additionally take an accumulator `acc`.
+    pub fn simd_reduce_add_unordered<T, U>(x: T) -> U;
+    pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
+    pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
+    pub fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;
+    pub fn simd_reduce_min<T, U>(x: T) -> U;
+    pub fn simd_reduce_max<T, U>(x: T) -> U;
+    pub fn simd_reduce_min_nanless<T, U>(x: T) -> U;
+    pub fn simd_reduce_max_nanless<T, U>(x: T) -> U;
+    pub fn simd_reduce_and<T, U>(x: T) -> U;
+    pub fn simd_reduce_or<T, U>(x: T) -> U;
+    pub fn simd_reduce_xor<T, U>(x: T) -> U;
+    pub fn simd_reduce_all<T>(x: T) -> bool;
+    pub fn simd_reduce_any<T>(x: T) -> bool;
+
+    // Selection between `a` and `b` driven by the mask `m`.
+    pub fn simd_select<M, T>(m: M, a: T, b: T) -> T;
+    // Only declared when not building with the stage0 compiler.
+    #[cfg(not(stage0))]
+    pub fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
+
+    // Floating-point minimum / maximum.
+    pub fn simd_fmin<T>(a: T, b: T) -> T;
+    pub fn simd_fmax<T>(a: T, b: T) -> T;
+
+    // Floating-point square root and fused multiply-add (three operands).
+    pub fn simd_fsqrt<T>(a: T) -> T;
+    pub fn simd_fma<T>(a: T, b: T, c: T) -> T;
+}
+
+// Stand-in used only with the stage0 (bootstrap) compiler: it ignores the
+// mask and always yields `a`. That is incorrect, but it compiles until the
+// bootstrap compiler is updated.
+#[cfg(stage0)]
+pub fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T {
+    // Consume the unused arguments explicitly, mask first.
+    drop(m);
+    drop(b);
+    a
+}
diff --git a/library/stdarch/crates/core_arch/src/v64.rs b/library/stdarch/crates/core_arch/src/v64.rs
new file mode 100644
index 00000000000..724f1fb0c44
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/v64.rs
@@ -0,0 +1,85 @@
+//! 64-bit wide vector types
+
+use prelude::v1::*;
+
+use core_arch::simd_llvm::*;
+
+// Type definitions for the 64-bit vectors. Each type is declared with
+// `define_ty!`/`define_ty_doc!` (name followed by one element type per lane)
+// and then passed to `define_impl!` together with its element type, lane
+// count, a same-width signed integer vector type and the lane identifiers.
+// NOTE(review): the macros are defined outside this chunk; the meaning of the
+// fourth `define_impl!` argument is inferred from the pattern — confirm.
+define_ty_doc! {
+    f32x2, f32, f32 |
+    /// A 64-bit vector with 2 `f32` lanes.
+}
+define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
+
+define_ty_doc! {
+    u32x2, u32, u32 |
+    /// A 64-bit vector with 2 `u32` lanes.
+}
+define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
+
+// 2 lanes of `i32`.
+define_ty! { i32x2, i32, i32 }
+define_impl! { i32x2, i32, 2, i32x2, x0, x1 }
+
+// 4 lanes of `u16`.
+define_ty! { u16x4, u16, u16, u16, u16 }
+define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 }
+
+// 4 lanes of `i16`.
+define_ty! { i16x4, i16, i16, i16, i16 }
+define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 }
+
+// 8 lanes of `u8`.
+define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 }
+define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+// 8 lanes of `i8`.
+define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 }
+define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+// One invocation per 64-bit integer vector type: the first argument is
+// followed by the five other integer vector types (`f32x2` is not listed).
+// NOTE(review): presumably this generates conversions into the first type from
+// each of the others — `define_from!` is defined elsewhere; confirm.
+define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
+define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8);
+define_from!(u16x4, u32x2, i32x2, i16x4, u8x8, i8x8);
+define_from!(i16x4, u32x2, i32x2, u16x4, u8x8, i8x8);
+define_from!(u8x8, u32x2, i32x2, u16x4, i16x4, i8x8);
+define_from!(i8x8, u32x2, i32x2, u16x4, i16x4, u8x8);
+
+// Operator sets, applied by category: "common" to all seven 64-bit types,
+// "float" to `f32x2` only, "integer" to every integer vector (paired with its
+// element type) and "signed integer" to the three signed types.
+// NOTE(review): which operators each macro generates is not visible here.
+define_common_ops!(f32x2, u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
+define_float_ops!(f32x2);
+define_integer_ops!(
+    (u32x2, u32),
+    (i32x2, i32),
+    (u16x4, u16),
+    (i16x4, i16),
+    (u8x8, u8),
+    (i8x8, i8)
+);
+define_signed_integer_ops!(i32x2, i16x4, i8x8);
+// Cast table. Each entry is `(source type, target type, method name)`; the
+// method name is always `as_<target type>`.
+// NOTE(review): presumably each entry generates that method on the source
+// type — `define_casts!` is defined elsewhere; confirm.
+define_casts!(
+    // Casts that keep the lane count and move to a different element type;
+    // `f64x2` is a 128-bit type, the other targets are 64-bit.
+    (f32x2, f64x2, as_f64x2),
+    (f32x2, u32x2, as_u32x2),
+    (f32x2, i32x2, as_i32x2),
+    (u32x2, f32x2, as_f32x2),
+    (u32x2, i32x2, as_i32x2),
+    (i32x2, f32x2, as_f32x2),
+    (i32x2, u32x2, as_u32x2),
+    (u16x4, i16x4, as_i16x4),
+    (i16x4, u16x4, as_u16x4),
+    (u8x8, i8x8, as_i8x8),
+    (i8x8, u8x8, as_u8x8),
+    // Casts to 128-bit types with the same lane count and lanes twice as wide.
+    (i8x8, i16x8, as_i16x8),
+    (u8x8, i16x8, as_i16x8),
+    (i16x4, i32x4, as_i32x4),
+    (i32x2, i64x2, as_i64x2),
+    (u8x8, u16x8, as_u16x8),
+    (u16x4, u32x4, as_u32x4),
+    (u16x4, i32x4, as_i32x4),
+    (u32x2, u64x2, as_u64x2),
+    (u32x2, i64x2, as_i64x2)
+);
+
+// Operator tests for the 64-bit vector types, grouped into signed-integer,
+// unsigned-integer and float macro invocations.
+// NOTE(review): the `test_ops_*!` macros are defined elsewhere; what they
+// assert is not visible here.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn operators() {
+        test_ops_si!(i8x8, i16x4, i32x2);
+        test_ops_ui!(u8x8, u16x4, u32x2);
+        test_ops_f!(f32x2);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/atomic.rs b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs
new file mode 100644
index 00000000000..4ebbaa19bd2
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs
@@ -0,0 +1,120 @@
+//! Intrinsics associated with WebAssembly's upcoming threads proposal.
+//!
+//! These intrinsics are all unstable because they're not actually stable in
+//! WebAssembly itself yet. The signatures may change as [the
+//! specification][spec] is updated.
+//!
+//! [spec]: https://github.com/WebAssembly/threads
+
+#![cfg(any(target_feature = "atomics", dox))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
+// LLVM intrinsics backing the public wrappers in this module, bound by their
+// `llvm.wasm.atomic.*` link names.
+extern "C" {
+    // Wait on a 32-bit value; wrapped by `i32_atomic_wait`.
+    #[link_name = "llvm.wasm.atomic.wait.i32"]
+    fn llvm_atomic_wait_i32(ptr: *mut i32, exp: i32, timeout: i64) -> i32;
+    // Wait on a 64-bit value; wrapped by `i64_atomic_wait`.
+    #[link_name = "llvm.wasm.atomic.wait.i64"]
+    fn llvm_atomic_wait_i64(ptr: *mut i64, exp: i64, timeout: i64) -> i32;
+    // Notify waiters; wrapped by `atomic_notify`.
+    #[link_name = "llvm.wasm.atomic.notify"]
+    fn llvm_atomic_notify(ptr: *mut i32, cnt: i32) -> i32;
+}
+
+/// Corresponding intrinsic to wasm's [`i32.atomic.wait` instruction][instr]
+///
+/// This function, when called, will block the current thread if the memory
+/// pointed to by `ptr` is equal to `expression` (performing this action
+/// atomically).
+///
+/// The argument `timeout_ns` is a maximum number of nanoseconds the calling
+/// thread will be blocked for, if it blocks. If the timeout is negative then
+/// the calling thread will be blocked forever.
+///
+/// The calling thread can only be woken up with a call to the
+/// `atomic_notify` intrinsic once it has been blocked. Changing the memory
+/// behind `ptr` will not wake the thread once it's blocked.
+///
+/// # Return value
+///
+/// * 0 - indicates that the thread blocked and then was woken up
+/// * 1 - the loaded value from `ptr` didn't match `expression`, the thread
+///   didn't block
+/// * 2 - the thread blocked, but the timeout expired.
+///
+/// # Availability
+///
+/// This intrinsic is only available **when the standard library itself is
+/// compiled with the `atomics` target feature**. This version of the standard
+/// library is not obtainable via `rustup`, but rather will require the
+/// standard library to be compiled from source.
+///
+/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wait
+#[inline]
+#[cfg_attr(test, assert_instr("i32.atomic.wait"))]
+pub unsafe fn i32_atomic_wait(ptr: *mut i32, expression: i32, timeout_ns: i64) -> i32 {
+    // Arguments are forwarded unchanged to the LLVM intrinsic.
+    llvm_atomic_wait_i32(ptr, expression, timeout_ns)
+}
+
+/// Corresponding intrinsic to wasm's [`i64.atomic.wait` instruction][instr]
+///
+/// This function, when called, will block the current thread if the memory
+/// pointed to by `ptr` is equal to `expression` (performing this action
+/// atomically).
+///
+/// The argument `timeout_ns` is a maximum number of nanoseconds the calling
+/// thread will be blocked for, if it blocks. If the timeout is negative then
+/// the calling thread will be blocked forever.
+///
+/// The calling thread can only be woken up with a call to the
+/// `atomic_notify` intrinsic once it has been blocked. Changing the memory
+/// behind `ptr` will not wake the thread once it's blocked.
+///
+/// # Return value
+///
+/// * 0 - indicates that the thread blocked and then was woken up
+/// * 1 - the loaded value from `ptr` didn't match `expression`, the thread
+///   didn't block
+/// * 2 - the thread blocked, but the timeout expired.
+///
+/// # Availability
+///
+/// This intrinsic is only available **when the standard library itself is
+/// compiled with the `atomics` target feature**. This version of the standard
+/// library is not obtainable via `rustup`, but rather will require the
+/// standard library to be compiled from source.
+///
+/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wait
+#[inline]
+#[cfg_attr(test, assert_instr("i64.atomic.wait"))]
+pub unsafe fn i64_atomic_wait(ptr: *mut i64, expression: i64, timeout_ns: i64) -> i32 {
+    // Arguments are forwarded unchanged to the LLVM intrinsic.
+    llvm_atomic_wait_i64(ptr, expression, timeout_ns)
+}
+
+/// Corresponding intrinsic to wasm's [`atomic.notify` instruction][instr]
+///
+/// This function will notify a number of threads blocked on the address
+/// indicated by `ptr`. Threads previously blocked with the `i32_atomic_wait`
+/// and `i64_atomic_wait` functions above will be woken up.
+///
+/// The `waiters` argument indicates how many waiters should be woken up (a
+/// maximum). If the value is zero no waiters are woken up.
+///
+/// # Return value
+///
+/// Returns the number of waiters which were actually notified.
+///
+/// # Availability
+///
+/// This intrinsic is only available **when the standard library itself is
+/// compiled with the `atomics` target feature**. This version of the standard
+/// library is not obtainable via `rustup`, but rather will require the
+/// standard library to be compiled from source.
+///
+/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wake
+#[inline]
+// NOTE(review): the asserted mnemonic is `atomic.wake` although the intrinsic
+// and this function are named `notify` — confirm which name the disassembler
+// used by the test harness prints.
+#[cfg_attr(test, assert_instr("atomic.wake"))]
+pub unsafe fn atomic_notify(ptr: *mut i32, waiters: u32) -> u32 {
+    // The LLVM intrinsic takes and returns `i32`; the casts reinterpret the
+    // count between `u32` and `i32`.
+    llvm_atomic_notify(ptr, waiters as i32) as u32
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/memory.rs b/library/stdarch/crates/core_arch/src/wasm32/memory.rs
new file mode 100644
index 00000000000..20958071837
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/memory.rs
@@ -0,0 +1,64 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
+// LLVM intrinsics backing `memory_grow` and `memory_size`; `mem` is the memory
+// index and the values are expressed as `i32`.
+extern "C" {
+    #[link_name = "llvm.wasm.memory.grow.i32"]
+    fn llvm_memory_grow(mem: i32, pages: i32) -> i32;
+    #[link_name = "llvm.wasm.memory.size.i32"]
+    fn llvm_memory_size(mem: i32) -> i32;
+}
+
+/// Corresponding intrinsic to wasm's [`memory.size` instruction][instr]
+///
+/// Returns the current size of linear memory, measured in pages. The current
+/// WebAssembly page size is 65536 bytes (64 KB).
+///
+/// `mem` is the numerical index of the memory whose size is queried. The
+/// WebAssembly specification currently supports only one memory, so zero must
+/// be passed; the argument exists for forward compatibility with future
+/// WebAssembly revisions. Passing a nonzero value currently aborts
+/// unconditionally.
+///
+/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-size
+#[inline]
+#[cfg_attr(test, assert_instr("memory.size", mem = 0))]
+#[rustc_args_required_const(0)]
+#[stable(feature = "simd_wasm32", since = "1.33.0")]
+pub fn memory_size(mem: u32) -> usize {
+    // Only memory index 0 is supported.
+    if mem != 0 {
+        unsafe { ::intrinsics::abort() }
+    }
+    let pages = unsafe { llvm_memory_size(0) };
+    pages as usize
+}
+
+/// Corresponding intrinsic to wasm's [`memory.grow` instruction][instr]
+///
+/// Attempts to grow the default linear memory by `delta` pages. The current
+/// WebAssembly page size is 65536 bytes (64 KB). On success the previous
+/// memory size, in pages, is returned; if memory cannot be grown,
+/// `usize::max_value()` is returned.
+///
+/// `mem` is the numerical index of the memory to grow. The WebAssembly
+/// specification currently supports only one memory, so zero must be passed;
+/// the argument exists for forward compatibility with future WebAssembly
+/// revisions. Passing a nonzero value currently aborts unconditionally.
+///
+/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-grow
+#[inline]
+#[cfg_attr(test, assert_instr("memory.grow", mem = 0))]
+#[rustc_args_required_const(0)]
+#[stable(feature = "simd_wasm32", since = "1.33.0")]
+pub fn memory_grow(mem: u32, delta: usize) -> usize {
+    // Only memory index 0 is supported.
+    if mem != 0 {
+        unsafe { ::intrinsics::abort() }
+    }
+    let previous = unsafe { llvm_memory_grow(0, delta as i32) };
+    // Widen through `isize` so that a result of -1 becomes
+    // `usize::max_value()`.
+    previous as isize as usize
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/mod.rs b/library/stdarch/crates/core_arch/src/wasm32/mod.rs
new file mode 100644
index 00000000000..056dfc60996
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/mod.rs
@@ -0,0 +1,26 @@
+//! WASM32 intrinsics
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
+#[cfg(any(target_feature = "atomics", dox))]
+mod atomic;
+#[cfg(any(target_feature = "atomics", dox))]
+pub use self::atomic::*;
+
+#[cfg(any(target_feature = "simd128", dox))]
+mod simd128;
+#[cfg(any(target_feature = "simd128", dox))]
+pub use self::simd128::*;
+
+mod memory;
+pub use self::memory::*;
+
+/// Generates the trap instruction `UNREACHABLE`
+///
+/// Never returns: the body is a call to `::intrinsics::abort()`.
+#[cfg_attr(test, assert_instr(unreachable))]
+#[inline]
+pub unsafe fn unreachable() -> ! {
+    ::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
new file mode 100644
index 00000000000..5723abab696
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
@@ -0,0 +1,2146 @@
+//! This module implements the [WebAssembly `SIMD128` ISA].
+//!
+//! [WebAssembly `SIMD128` ISA]:
+//! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md
+
+#![allow(non_camel_case_types)]
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use marker::Sized;
+use mem;
+use ptr;
+use u8;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
+// Declares the opaque `v128` vector type. Its four `i32` fields give
+// 4 * 32 = 128 bits; lane-typed views are obtained by transmuting (see the
+// `as_*` methods of `v128Ext`).
+types! {
+    /// WASM-specific 128-bit wide SIMD vector type
+    pub struct v128(i32, i32, i32, i32); // NB: internals here are arbitrary
+}
+
+/// Crate-internal helpers for viewing a 128-bit value as a concrete lane type.
+///
+/// Implementors supply only `as_v128`; every other method transmutes that
+/// value into the named vector type without changing any bits.
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "0")]
+pub(crate) trait v128Ext: Sized {
+    /// Returns `self` as a `v128`.
+    fn as_v128(self) -> v128;
+
+    /// Reinterprets the bits as a `u8x16`.
+    #[inline]
+    fn as_u8x16(self) -> u8x16 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as a `u16x8`.
+    #[inline]
+    fn as_u16x8(self) -> u16x8 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as a `u32x4`.
+    #[inline]
+    fn as_u32x4(self) -> u32x4 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as a `u64x2`.
+    #[inline]
+    fn as_u64x2(self) -> u64x2 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `i8x16`.
+    #[inline]
+    fn as_i8x16(self) -> i8x16 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `i16x8`.
+    #[inline]
+    fn as_i16x8(self) -> i16x8 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `i32x4`.
+    #[inline]
+    fn as_i32x4(self) -> i32x4 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `i64x2`.
+    #[inline]
+    fn as_i64x2(self) -> i64x2 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `f32x4`.
+    #[inline]
+    fn as_f32x4(self) -> f32x4 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+
+    /// Reinterprets the bits as an `f64x2`.
+    #[inline]
+    fn as_f64x2(self) -> f64x2 {
+        let bits = self.as_v128();
+        unsafe { mem::transmute(bits) }
+    }
+}
+
+// `v128` is its own raw representation, so `as_v128` is the identity; the
+// typed `as_*` views come from the trait's default methods.
+impl v128Ext for v128 {
+    #[inline]
+    fn as_v128(self) -> Self {
+        self
+    }
+}
+
+// LLVM intrinsics used by this module, bound by link name and grouped by lane
+// type.
+// NOTE(review): `improper_ctypes` is presumably allowed because the crate's
+// SIMD vector types appear in `extern "C"` signatures — confirm.
+#[allow(improper_ctypes)]
+extern "C" {
+    // 16 x 8-bit lanes: any/all-true tests and saturating add/sub.
+    #[link_name = "llvm.wasm.anytrue.v16i8"]
+    fn llvm_i8x16_any_true(x: i8x16) -> i32;
+    #[link_name = "llvm.wasm.alltrue.v16i8"]
+    fn llvm_i8x16_all_true(x: i8x16) -> i32;
+    #[link_name = "llvm.sadd.sat.v16i8"]
+    fn llvm_i8x16_add_saturate_s(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.uadd.sat.v16i8"]
+    fn llvm_i8x16_add_saturate_u(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.wasm.sub.saturate.signed.v16i8"]
+    fn llvm_i8x16_sub_saturate_s(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.wasm.sub.saturate.unsigned.v16i8"]
+    fn llvm_i8x16_sub_saturate_u(a: i8x16, b: i8x16) -> i8x16;
+
+    // 8 x 16-bit lanes: any/all-true tests and saturating add/sub.
+    #[link_name = "llvm.wasm.anytrue.v8i16"]
+    fn llvm_i16x8_any_true(x: i16x8) -> i32;
+    #[link_name = "llvm.wasm.alltrue.v8i16"]
+    fn llvm_i16x8_all_true(x: i16x8) -> i32;
+    #[link_name = "llvm.sadd.sat.v8i16"]
+    fn llvm_i16x8_add_saturate_s(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.uadd.sat.v8i16"]
+    fn llvm_i16x8_add_saturate_u(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.wasm.sub.saturate.signed.v8i16"]
+    fn llvm_i16x8_sub_saturate_s(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.wasm.sub.saturate.unsigned.v8i16"]
+    fn llvm_i16x8_sub_saturate_u(a: i16x8, b: i16x8) -> i16x8;
+
+    // 4 x 32-bit lanes: any/all-true tests.
+    #[link_name = "llvm.wasm.anytrue.v4i32"]
+    fn llvm_i32x4_any_true(x: i32x4) -> i32;
+    #[link_name = "llvm.wasm.alltrue.v4i32"]
+    fn llvm_i32x4_all_true(x: i32x4) -> i32;
+
+    // 2 x 64-bit lanes: any/all-true tests.
+    #[link_name = "llvm.wasm.anytrue.v2i64"]
+    fn llvm_i64x2_any_true(x: i64x2) -> i32;
+    #[link_name = "llvm.wasm.alltrue.v2i64"]
+    fn llvm_i64x2_all_true(x: i64x2) -> i32;
+
+    // Floating point (4 x f32, then 2 x f64): abs, sqrt, minimum, maximum.
+    #[link_name = "llvm.fabs.v4f32"]
+    fn llvm_f32x4_abs(x: f32x4) -> f32x4;
+    #[link_name = "llvm.sqrt.v4f32"]
+    fn llvm_f32x4_sqrt(x: f32x4) -> f32x4;
+    #[link_name = "llvm.minimum.v4f32"]
+    fn llvm_f32x4_min(x: f32x4, y: f32x4) -> f32x4;
+    #[link_name = "llvm.maximum.v4f32"]
+    fn llvm_f32x4_max(x: f32x4, y: f32x4) -> f32x4;
+    #[link_name = "llvm.fabs.v2f64"]
+    fn llvm_f64x2_abs(x: f64x2) -> f64x2;
+    #[link_name = "llvm.sqrt.v2f64"]
+    fn llvm_f64x2_sqrt(x: f64x2) -> f64x2;
+    #[link_name = "llvm.minimum.v2f64"]
+    fn llvm_f64x2_min(x: f64x2, y: f64x2) -> f64x2;
+    #[link_name = "llvm.maximum.v2f64"]
+    fn llvm_f64x2_max(x: f64x2, y: f64x2) -> f64x2;
+
+    // Bit select over the whole vector, declared on the 16 x 8-bit view.
+    #[link_name = "llvm.wasm.bitselect.v16i8"]
+    fn llvm_bitselect(a: i8x16, b: i8x16, c: i8x16) -> i8x16;
+}
+
+/// Load a `v128` vector from the given heap address.
+///
+/// # Unsafety
+///
+/// The value is read with `ptr::read`, so `m` must satisfy that function's
+/// requirements: it must be valid for reads and properly aligned.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load))]
+pub unsafe fn v128_load(m: *const v128) -> v128 {
+    ptr::read(m)
+}
+
+/// Store a `v128` vector to the given heap address.
+///
+/// # Unsafety
+///
+/// The value is written with `ptr::write`, so `m` must satisfy that function's
+/// requirements: it must be valid for writes and properly aligned.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store))]
+pub unsafe fn v128_store(m: *mut v128, a: v128) {
+    ptr::write(m, a)
+}
+
+/// Materialize a constant SIMD value from the immediate operands.
+///
+/// The `v128.const` instruction is encoded with 16 immediate bytes
+/// `imm` which provide the bits of the vector directly.
+///
+/// The arguments `a0`..`a15` supply those bytes in order: `a0` becomes the
+/// first byte of the vector's in-memory representation and `a15` the last.
+/// All sixteen arguments are required to be constants.
+#[inline]
+#[rustc_args_required_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)]
+#[cfg_attr(test, assert_instr(
+    v128.const,
+    a0 = 0,
+    a1 = 1,
+    a2 = 2,
+    a3 = 3,
+    a4 = 4,
+    a5 = 5,
+    a6 = 6,
+    a7 = 7,
+    a8 = 8,
+    a9 = 9,
+    a10 = 10,
+    a11 = 11,
+    a12 = 12,
+    a13 = 13,
+    a14 = 14,
+    a15 = 15,
+))]
+pub const fn v128_const(
+    a0: u8,
+    a1: u8,
+    a2: u8,
+    a3: u8,
+    a4: u8,
+    a5: u8,
+    a6: u8,
+    a7: u8,
+    a8: u8,
+    a9: u8,
+    a10: u8,
+    a11: u8,
+    a12: u8,
+    a13: u8,
+    a14: u8,
+    a15: u8,
+) -> v128 {
+    // Type-pun the byte array into a `v128` through a union: the value is
+    // written as `imm` and read back as `vec`. The array is 16 bytes and
+    // `v128` is declared with four `i32` fields.
+    union U {
+        imm: [u8; 16],
+        vec: v128,
+    }
+    unsafe {
+        U {
+            imm: [
+                a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15,
+            ],
+        }
+        .vec
+    }
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 16 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.splat))]
+pub fn i8x16_splat(a: i8) -> v128 {
+    let lanes = i8x16::splat(a);
+    unsafe { mem::transmute(lanes) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 16 packed i8 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 16.
+#[inline]
+#[rustc_args_required_const(1)]
+pub unsafe fn i8x16_extract_lane(a: v128, imm: usize) -> i8 {
+    // Test-only probes: widening the extracted lane with sign extension or
+    // zero extension should select the `_s` or `_u` form of the instruction.
+    #[cfg(test)]
+    #[assert_instr(i8x16.extract_lane_s)]
+    fn extract_lane_s(a: v128) -> i32 {
+        unsafe { i8x16_extract_lane(a, 0) as i32 }
+    }
+    #[cfg(test)]
+    #[assert_instr(i8x16.extract_lane_u)]
+    fn extract_lane_u(a: v128) -> u32 {
+        // Go through `u8` first: `i8 as u32` would sign-extend.
+        unsafe { i8x16_extract_lane(a, 0) as u8 as u32 }
+    }
+    simd_extract(a.as_i8x16(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 16 packed i8 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 16.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 {
+    let updated: i8x16 = simd_insert(a.as_i8x16(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 8 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.splat))]
+pub fn i16x8_splat(a: i16) -> v128 {
+    unsafe { mem::transmute(i16x8::splat(a)) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 8.
+#[inline]
+#[rustc_args_required_const(1)]
+pub unsafe fn i16x8_extract_lane(a: v128, imm: usize) -> i16 {
+    // Test-only probes: widening the extracted lane with sign extension or
+    // zero extension should select the `_s` or `_u` form of the instruction.
+    #[cfg(test)]
+    #[assert_instr(i16x8.extract_lane_s)]
+    fn extract_lane_s(a: v128) -> i32 {
+        unsafe { i16x8_extract_lane(a, 0) as i32 }
+    }
+    #[cfg(test)]
+    #[assert_instr(i16x8.extract_lane_u)]
+    fn extract_lane_u(a: v128) -> u32 {
+        // Go through `u16` first: `i16 as u32` would sign-extend.
+        unsafe { i16x8_extract_lane(a, 0) as u16 as u32 }
+    }
+    simd_extract(a.as_i16x8(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 8.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 {
+    let updated: i16x8 = simd_insert(a.as_i16x8(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 4 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.splat))]
+pub fn i32x4_splat(a: i32) -> v128 {
+    unsafe { mem::transmute(i32x4::splat(a)) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 4.
+#[inline]
+// 32-bit lanes need no widening, so the instruction has no `_s`/`_u` suffix.
+#[cfg_attr(test, assert_instr(i32x4.extract_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i32x4_extract_lane(a: v128, imm: usize) -> i32 {
+    simd_extract(a.as_i32x4(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 4.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 {
+    let updated: i32x4 = simd_insert(a.as_i32x4(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 2 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.splat))]
+pub fn i64x2_splat(a: i64) -> v128 {
+    unsafe { mem::transmute(i64x2::splat(a)) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 2.
+#[inline]
+// 64-bit lanes need no widening, so the instruction has no `_s`/`_u` suffix.
+#[cfg_attr(test, assert_instr(i64x2.extract_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 {
+    simd_extract(a.as_i64x2(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 2.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 {
+    let updated: i64x2 = simd_insert(a.as_i64x2(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 4 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.splat))]
+pub fn f32x4_splat(a: f32) -> v128 {
+    unsafe { mem::transmute(f32x4::splat(a)) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 4.
+#[inline]
+// Floating-point lanes have no signed/unsigned variants of the instruction.
+#[cfg_attr(test, assert_instr(f32x4.extract_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn f32x4_extract_lane(a: v128, imm: usize) -> f32 {
+    simd_extract(a.as_f32x4(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 4.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 {
+    let updated: f32x4 = simd_insert(a.as_f32x4(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Create vector with identical lanes
+///
+/// Construct a vector with `a` replicated to all 2 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.splat))]
+pub fn f64x2_splat(a: f64) -> v128 {
+    unsafe { mem::transmute(f64x2::splat(a)) }
+}
+
+/// Extract lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Extract the scalar value of lane specified in the immediate mode operand
+/// `imm` from `a`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 2.
+#[inline]
+// Floating-point lanes have no signed/unsigned variants of the instruction.
+#[cfg_attr(test, assert_instr(f64x2.extract_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 {
+    simd_extract(a.as_f64x2(), imm as u32)
+}
+
+/// Replace a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Returns `a` with the lane selected by the immediate mode operand `imm`
+/// replaced by `val`.
+///
+/// # Unsafety
+///
+/// This function has undefined behavior if `imm` is greater than or equal to
+/// 2.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.replace_lane, imm = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 {
+    let updated: f64x2 = simd_insert(a.as_f64x2(), imm as u32, val);
+    mem::transmute(updated)
+}
+
+/// Lane-wise equality of two 128-bit vectors viewed as 16 lanes of
+/// eight-bit integers.
+///
+/// Each lane of the result is all ones where the corresponding lanes of `a`
+/// and `b` are equal, and all zeros where they differ.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.eq))]
+pub fn i8x16_eq(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i8x16 = simd_eq(a.as_i8x16(), b.as_i8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise inequality of two 128-bit vectors viewed as 16 lanes of
+/// eight-bit integers.
+///
+/// Each lane of the result is all ones where the corresponding lanes of `a`
+/// and `b` differ, and all zeros where they are equal.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ne))]
+pub fn i8x16_ne(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i8x16 = simd_ne(a.as_i8x16(), b.as_i8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "less than" of two 128-bit vectors viewed as 16 lanes of
+/// eight-bit signed integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is less than the
+/// corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.lt_s))]
+pub fn i8x16_lt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i8x16 = simd_lt(a.as_i8x16(), b.as_i8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "less than" of two 128-bit vectors viewed as 16 lanes
+/// of eight-bit unsigned integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is less than the
+/// corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.lt_u))]
+pub fn i8x16_lt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        // Operands are compared as `u8x16`; the mask is produced as `i8x16`.
+        let mask: i8x16 = simd_lt(a.as_u8x16(), b.as_u8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "greater than" of two 128-bit vectors viewed as 16 lanes
+/// of eight-bit signed integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is greater than
+/// the corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.gt_s))]
+pub fn i8x16_gt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i8x16 = simd_gt(a.as_i8x16(), b.as_i8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "greater than" of two 128-bit vectors viewed as 16
+/// lanes of eight-bit unsigned integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is greater than
+/// the corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.gt_u))]
+pub fn i8x16_gt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        // Operands are compared as `u8x16`; the mask is produced as `i8x16`.
+        let mask: i8x16 = simd_gt(a.as_u8x16(), b.as_u8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "less than or equal" of two 128-bit vectors viewed as 16
+/// lanes of eight-bit signed integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is less than or
+/// equal to the corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.le_s))]
+pub fn i8x16_le_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i8x16 = simd_le(a.as_i8x16(), b.as_i8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "less than or equal" of two 128-bit vectors viewed as
+/// 16 lanes of eight-bit unsigned integers.
+///
+/// Each lane of the result is all ones where the lane of `a` is less than or
+/// equal to the corresponding lane of `b`, and all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.le_u))]
+pub fn i8x16_le_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        // Operands are compared as `u8x16`; the mask is produced as `i8x16`.
+        let mask: i8x16 = simd_le(a.as_u8x16(), b.as_u8x16());
+        mem::transmute(mask)
+    }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ge_s))]
+pub fn i8x16_ge_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ge_u))]
+pub fn i8x16_ge_u(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Lane-wise equality test over two 128-bit vectors, each viewed as 8 lanes
+/// of 16-bit integers.
+///
+/// A result lane has every bit set when the two lanes are equal; otherwise
+/// the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.eq))]
+pub fn i16x8_eq(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_eq(a.as_i16x8(), b.as_i16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise inequality test over two 128-bit vectors, each viewed as 8
+/// lanes of 16-bit integers.
+///
+/// A result lane has every bit set when the two lanes differ; otherwise the
+/// lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ne))]
+pub fn i16x8_ne(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_ne(a.as_i16x8(), b.as_i16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "less than" over two 128-bit vectors, each viewed as 8
+/// lanes of 16-bit signed integers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.lt_s))]
+pub fn i16x8_lt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_lt(a.as_i16x8(), b.as_i16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "less than" over two 128-bit vectors, each viewed as 8
+/// lanes of 16-bit unsigned integers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.lt_u))]
+pub fn i16x8_lt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_lt(a.as_u16x8(), b.as_u16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "greater than" over two 128-bit vectors, each viewed as
+/// 8 lanes of 16-bit signed integers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.gt_s))]
+pub fn i16x8_gt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_gt(a.as_i16x8(), b.as_i16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "greater than" over two 128-bit vectors, each viewed
+/// as 8 lanes of 16-bit unsigned integers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.gt_u))]
+pub fn i16x8_gt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i16x8 = simd_gt(a.as_u16x8(), b.as_u16x8());
+        mem::transmute(mask)
+    }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.le_s))]
+pub fn i16x8_le_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.le_u))]
+pub fn i16x8_le_u(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ge_s))]
+pub fn i16x8_ge_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ge_u))]
+pub fn i16x8_ge_u(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Lane-wise equality test over two 128-bit vectors, each viewed as 4 lanes
+/// of 32-bit integers.
+///
+/// A result lane has every bit set when the two lanes are equal; otherwise
+/// the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.eq))]
+pub fn i32x4_eq(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_eq(a.as_i32x4(), b.as_i32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise inequality test over two 128-bit vectors, each viewed as 4
+/// lanes of 32-bit integers.
+///
+/// A result lane has every bit set when the two lanes differ; otherwise the
+/// lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ne))]
+pub fn i32x4_ne(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_ne(a.as_i32x4(), b.as_i32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "less than" over two 128-bit vectors, each viewed as 4
+/// lanes of 32-bit signed integers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.lt_s))]
+pub fn i32x4_lt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_lt(a.as_i32x4(), b.as_i32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "less than" over two 128-bit vectors, each viewed as 4
+/// lanes of 32-bit unsigned integers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.lt_u))]
+pub fn i32x4_lt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_lt(a.as_u32x4(), b.as_u32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise signed "greater than" over two 128-bit vectors, each viewed as
+/// 4 lanes of 32-bit signed integers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.gt_s))]
+pub fn i32x4_gt_s(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_gt(a.as_i32x4(), b.as_i32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise unsigned "greater than" over two 128-bit vectors, each viewed
+/// as 4 lanes of 32-bit unsigned integers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.gt_u))]
+pub fn i32x4_gt_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_gt(a.as_u32x4(), b.as_u32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.le_s))]
+pub fn i32x4_le_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.le_u))]
+pub fn i32x4_le_u(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ge_s))]
+pub fn i32x4_ge_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ge_u))]
+pub fn i32x4_ge_u(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) }
+}
+
+/// Lane-wise equality test over two 128-bit vectors, each viewed as 4 lanes
+/// of 32-bit floating point numbers.
+///
+/// A result lane has every bit set when the two lanes compare equal;
+/// otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.eq))]
+pub fn f32x4_eq(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_eq(a.as_f32x4(), b.as_f32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise inequality test over two 128-bit vectors, each viewed as 4
+/// lanes of 32-bit floating point numbers.
+///
+/// A result lane has every bit set when the two lanes compare not equal;
+/// otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.ne))]
+pub fn f32x4_ne(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_ne(a.as_f32x4(), b.as_f32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise "less than" over two 128-bit vectors, each viewed as 4 lanes of
+/// 32-bit floating point numbers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.lt))]
+pub fn f32x4_lt(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_lt(a.as_f32x4(), b.as_f32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise "greater than" over two 128-bit vectors, each viewed as 4 lanes
+/// of 32-bit floating point numbers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.gt))]
+pub fn f32x4_gt(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i32x4 = simd_gt(a.as_f32x4(), b.as_f32x4());
+        mem::transmute(mask)
+    }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.le))]
+pub fn f32x4_le(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.ge))]
+pub fn f32x4_ge(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) }
+}
+
+/// Lane-wise equality test over two 128-bit vectors, each viewed as 2 lanes
+/// of 64-bit floating point numbers.
+///
+/// A result lane has every bit set when the two lanes compare equal;
+/// otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.eq))]
+pub fn f64x2_eq(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i64x2 = simd_eq(a.as_f64x2(), b.as_f64x2());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise inequality test over two 128-bit vectors, each viewed as 2
+/// lanes of 64-bit floating point numbers.
+///
+/// A result lane has every bit set when the two lanes compare not equal;
+/// otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.ne))]
+pub fn f64x2_ne(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i64x2 = simd_ne(a.as_f64x2(), b.as_f64x2());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise "less than" over two 128-bit vectors, each viewed as 2 lanes of
+/// 64-bit floating point numbers.
+///
+/// A result lane has every bit set when the lane of `a` is less than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.lt))]
+pub fn f64x2_lt(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i64x2 = simd_lt(a.as_f64x2(), b.as_f64x2());
+        mem::transmute(mask)
+    }
+}
+
+/// Lane-wise "greater than" over two 128-bit vectors, each viewed as 2 lanes
+/// of 64-bit floating point numbers.
+///
+/// A result lane has every bit set when the lane of `a` is greater than the
+/// matching lane of `b`; otherwise the lane is all zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.gt))]
+pub fn f64x2_gt(a: v128, b: v128) -> v128 {
+    unsafe {
+        let mask: i64x2 = simd_gt(a.as_f64x2(), b.as_f64x2());
+        mem::transmute(mask)
+    }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is less than or equal to the pairwise right element, or all zeros
+/// otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.le))]
+pub fn f64x2_le(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_le::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the pairwise left
+/// element is greater than or equal to the pairwise right element, or all
+/// zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.ge))]
+pub fn f64x2_ge(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(simd_ge::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) }
+}
+
+/// Computes the bitwise complement of the 128-bit input vector.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.not))]
+pub fn v128_not(a: v128) -> v128 {
+    unsafe {
+        // XOR against an all-ones vector inverts every bit.
+        let all_ones = i64x2(!0, !0);
+        mem::transmute(simd_xor(a.as_i64x2(), all_ones))
+    }
+}
+
+/// Returns the bitwise AND of the two 128-bit input vectors.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.and))]
+pub fn v128_and(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
+        mem::transmute(simd_and(lhs, rhs))
+    }
+}
+
+/// Returns the bitwise OR of the two 128-bit input vectors.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.or))]
+pub fn v128_or(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
+        mem::transmute(simd_or(lhs, rhs))
+    }
+}
+
+/// Returns the bitwise XOR of the two 128-bit input vectors.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.xor))]
+pub fn v128_xor(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
+        mem::transmute(simd_xor(lhs, rhs))
+    }
+}
+
+/// Bitwise select: each result bit is taken from `v1` where the matching bit
+/// of the mask `c` is 1, and from `v2` where it is 0.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.bitselect))]
+pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 {
+    unsafe {
+        let mask = c.as_i8x16();
+        // NOTE(review): the mask is passed as the FIRST argument here; confirm
+        // this matches the operand order of the `llvm_bitselect` declaration.
+        let picked = llvm_bitselect(mask, v1.as_i8x16(), v2.as_i8x16());
+        mem::transmute(picked)
+    }
+}
+
+/// Negates every lane of a 128-bit vector interpreted as sixteen 8-bit signed
+/// integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.neg))]
+pub fn i8x16_neg(a: v128) -> v128 {
+    unsafe {
+        // Negation is expressed as a lane-wise multiplication by -1.
+        let minus_one = i8x16::splat(-1);
+        mem::transmute(simd_mul(a.as_i8x16(), minus_one))
+    }
+}
+
+/// Tests whether at least one of the sixteen 8-bit lanes is nonzero: yields
+/// 1 if so, and 0 when every lane is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.any_true))]
+pub fn i8x16_any_true(a: v128) -> i32 {
+    unsafe {
+        let lanes = a.as_i8x16();
+        llvm_i8x16_any_true(lanes)
+    }
+}
+
+/// Returns 1 if all lanes are nonzero or 0 if any lane is zero.
+///
+/// The vector is interpreted as sixteen 8-bit lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.all_true))]
+pub fn i8x16_all_true(a: v128) -> i32 {
+    unsafe { llvm_i8x16_all_true(a.as_i8x16()) }
+}
+
+/// Shifts each of the sixteen 8-bit lanes to the left by the specified
+/// number of bits.
+///
+/// Only the low 3 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (8).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shl))]
+pub fn i8x16_shl(a: v128, amt: u32) -> v128 {
+    // `simd_shl` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 8 first to get the documented wrap-around behaviour.
+    unsafe { mem::transmute(simd_shl(a.as_i8x16(), i8x16::splat((amt & 0x7) as i8))) }
+}
+
+/// Shifts each of the sixteen 8-bit lanes to the right by the specified
+/// number of bits, sign extending.
+///
+/// Only the low 3 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (8).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shr_s))]
+pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 8 first. Signed lanes make this an arithmetic shift.
+    unsafe { mem::transmute(simd_shr(a.as_i8x16(), i8x16::splat((amt & 0x7) as i8))) }
+}
+
+/// Shifts each of the sixteen 8-bit lanes to the right by the specified
+/// number of bits, shifting in zeros.
+///
+/// Only the low 3 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (8).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shr_u))]
+pub fn i8x16_shr_u(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 8 first. Unsigned lanes make this a logical shift.
+    unsafe { mem::transmute(simd_shr(a.as_u8x16(), u8x16::splat((amt & 0x7) as u8))) }
+}
+
+/// Lane-wise addition of two 128-bit vectors, each viewed as sixteen packed
+/// 8-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add))]
+pub fn i8x16_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Adds two 128-bit vectors as if they were two packed sixteen 8-bit signed
+/// integers, saturating on overflow to `i8::max_value()` (positive overflow)
+/// or `i8::min_value()` (negative overflow).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_saturate_s))]
+pub fn i8x16_add_saturate_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(llvm_i8x16_add_saturate_s(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Lane-wise saturating addition of two 128-bit vectors, each viewed as
+/// sixteen packed 8-bit unsigned integers; a lane that would overflow is
+/// clamped to `u8::max_value()`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_saturate_u))]
+pub fn i8x16_add_saturate_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+        mem::transmute(llvm_i8x16_add_saturate_u(lhs, rhs))
+    }
+}
+
+/// Lane-wise subtraction (`a - b`) of two 128-bit vectors, each viewed as
+/// sixteen packed 8-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub))]
+pub fn i8x16_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit
+/// signed integers, saturating on overflow to `i8::min_value()` (negative
+/// overflow) or `i8::max_value()` (positive overflow).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_saturate_s))]
+pub fn i8x16_sub_saturate_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(llvm_i8x16_sub_saturate_s(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Lane-wise saturating subtraction (`a - b`) of two 128-bit vectors, each
+/// viewed as sixteen packed 8-bit unsigned integers; a lane that would
+/// underflow is clamped to 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_saturate_u))]
+pub fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+        mem::transmute(llvm_i8x16_sub_saturate_u(lhs, rhs))
+    }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors, each viewed as sixteen
+/// packed 8-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.mul))]
+pub fn i8x16_mul(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+        mem::transmute(simd_mul(lhs, rhs))
+    }
+}
+
+/// Negates every lane of a 128-bit vector interpreted as eight 16-bit signed
+/// integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.neg))]
+pub fn i16x8_neg(a: v128) -> v128 {
+    unsafe {
+        // Negation is expressed as a lane-wise multiplication by -1.
+        let minus_one = i16x8::splat(-1);
+        mem::transmute(simd_mul(a.as_i16x8(), minus_one))
+    }
+}
+
+/// Tests whether at least one of the eight 16-bit lanes is nonzero: yields 1
+/// if so, and 0 when every lane is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.any_true))]
+pub fn i16x8_any_true(a: v128) -> i32 {
+    unsafe {
+        let lanes = a.as_i16x8();
+        llvm_i16x8_any_true(lanes)
+    }
+}
+
+/// Returns 1 if all lanes are nonzero or 0 if any lane is zero.
+///
+/// The vector is interpreted as eight 16-bit lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.all_true))]
+pub fn i16x8_all_true(a: v128) -> i32 {
+    unsafe { llvm_i16x8_all_true(a.as_i16x8()) }
+}
+
+/// Shifts each of the eight 16-bit lanes to the left by the specified number
+/// of bits.
+///
+/// Only the low 4 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (16).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shl))]
+pub fn i16x8_shl(a: v128, amt: u32) -> v128 {
+    // `simd_shl` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 16 first to get the documented wrap-around behaviour.
+    unsafe { mem::transmute(simd_shl(a.as_i16x8(), i16x8::splat((amt & 0xf) as i16))) }
+}
+
+/// Shifts each of the eight 16-bit lanes to the right by the specified
+/// number of bits, sign extending.
+///
+/// Only the low 4 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (16).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shr_s))]
+pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 16 first. Signed lanes make this an arithmetic shift.
+    unsafe { mem::transmute(simd_shr(a.as_i16x8(), i16x8::splat((amt & 0xf) as i16))) }
+}
+
+/// Shifts each of the eight 16-bit lanes to the right by the specified
+/// number of bits, shifting in zeros.
+///
+/// Only the low 4 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (16).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shr_u))]
+pub fn i16x8_shr_u(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 16 first. Unsigned lanes make this a logical shift.
+    unsafe { mem::transmute(simd_shr(a.as_u16x8(), u16x8::splat((amt & 0xf) as u16))) }
+}
+
+/// Lane-wise addition of two 128-bit vectors, each viewed as eight packed
+/// 16-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add))]
+pub fn i16x8_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Adds two 128-bit vectors as if they were two packed eight 16-bit signed
+/// integers, saturating on overflow to `i16::max_value()` (positive overflow)
+/// or `i16::min_value()` (negative overflow).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add_saturate_s))]
+pub fn i16x8_add_saturate_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(llvm_i16x8_add_saturate_s(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Lane-wise saturating addition of two 128-bit vectors, each viewed as
+/// eight packed 16-bit unsigned integers; a lane that would overflow is
+/// clamped to `u16::max_value()`.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add_saturate_u))]
+pub fn i16x8_add_saturate_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+        mem::transmute(llvm_i16x8_add_saturate_u(lhs, rhs))
+    }
+}
+
+/// Lane-wise subtraction (`a - b`) of two 128-bit vectors, each viewed as
+/// eight packed 16-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub))]
+pub fn i16x8_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Subtracts two 128-bit vectors as if they were two packed eight 16-bit
+/// signed integers, saturating on overflow to `i16::min_value()` (negative
+/// overflow) or `i16::max_value()` (positive overflow).
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub_saturate_s))]
+pub fn i16x8_sub_saturate_s(a: v128, b: v128) -> v128 {
+    unsafe { mem::transmute(llvm_i16x8_sub_saturate_s(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Lane-wise saturating subtraction (`a - b`) of two 128-bit vectors, each
+/// viewed as eight packed 16-bit unsigned integers; a lane that would
+/// underflow is clamped to 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub_saturate_u))]
+pub fn i16x8_sub_saturate_u(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+        mem::transmute(llvm_i16x8_sub_saturate_u(lhs, rhs))
+    }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors, each viewed as eight
+/// packed 16-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.mul))]
+pub fn i16x8_mul(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+        mem::transmute(simd_mul(lhs, rhs))
+    }
+}
+
+/// Negates every lane of a 128-bit vector interpreted as four 32-bit signed
+/// integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.neg))]
+pub fn i32x4_neg(a: v128) -> v128 {
+    unsafe {
+        // Negation is expressed as a lane-wise multiplication by -1.
+        let minus_one = i32x4::splat(-1);
+        mem::transmute(simd_mul(a.as_i32x4(), minus_one))
+    }
+}
+
+/// Tests whether at least one of the four 32-bit lanes is nonzero: yields 1
+/// if so, and 0 when every lane is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.any_true))]
+pub fn i32x4_any_true(a: v128) -> i32 {
+    unsafe {
+        let lanes = a.as_i32x4();
+        llvm_i32x4_any_true(lanes)
+    }
+}
+
+/// Returns 1 if all lanes are nonzero or 0 if any lane is zero.
+///
+/// The vector is interpreted as four 32-bit lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.all_true))]
+pub fn i32x4_all_true(a: v128) -> i32 {
+    unsafe { llvm_i32x4_all_true(a.as_i32x4()) }
+}
+
+/// Shifts each of the four 32-bit lanes to the left by the specified number
+/// of bits.
+///
+/// Only the low 5 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (32).
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shl))]
+pub fn i32x4_shl(a: v128, amt: u32) -> v128 {
+    // `simd_shl` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 32 first to get the documented wrap-around behaviour.
+    unsafe { mem::transmute(simd_shl(a.as_i32x4(), i32x4::splat((amt & 0x1f) as i32))) }
+}
+
+/// Shifts each of the four 32-bit lanes to the right by the specified number
+/// of bits, sign extending.
+///
+/// Only the low 5 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (32).
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shr_s))]
+pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 32 first. Signed lanes make this an arithmetic shift.
+    unsafe { mem::transmute(simd_shr(a.as_i32x4(), i32x4::splat((amt & 0x1f) as i32))) }
+}
+
+/// Shifts each of the four 32-bit lanes to the right by the specified number
+/// of bits, shifting in zeros.
+///
+/// Only the low 5 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (32).
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shr_u))]
+pub fn i32x4_shr_u(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 32 first. Unsigned lanes make this a logical shift.
+    unsafe { mem::transmute(simd_shr(a.as_u32x4(), u32x4::splat(amt & 0x1f))) }
+}
+
+/// Lane-wise addition of two 128-bit vectors, each viewed as four packed
+/// 32-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.add))]
+pub fn i32x4_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Lane-wise subtraction (`a - b`) of two 128-bit vectors, each viewed as
+/// four packed 32-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.sub))]
+pub fn i32x4_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors, each viewed as four
+/// packed 32-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.mul))]
+pub fn i32x4_mul(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+        mem::transmute(simd_mul(lhs, rhs))
+    }
+}
+
+/// Negates a 128-bit vector interpreted as two 64-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.neg))]
+pub fn i64x2_neg(a: v128) -> v128 {
+    // Negation is expressed as a lane-wise multiplication by -1.
+    unsafe { mem::transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) }
+}
+
+/// Tests whether at least one of the two 64-bit lanes is nonzero: yields 1
+/// if so, and 0 when every lane is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.any_true))]
+pub fn i64x2_any_true(a: v128) -> i32 {
+    unsafe {
+        let lanes = a.as_i64x2();
+        llvm_i64x2_any_true(lanes)
+    }
+}
+
+/// Returns 1 if all lanes are nonzero or 0 if any lane is zero.
+///
+/// The vector is interpreted as two 64-bit lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.all_true))]
+pub fn i64x2_all_true(a: v128) -> i32 {
+    unsafe { llvm_i64x2_all_true(a.as_i64x2()) }
+}
+
+/// Shifts each of the two 64-bit lanes to the left by the specified number
+/// of bits.
+///
+/// Only the low 6 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (64).
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shl))]
+pub fn i64x2_shl(a: v128, amt: u32) -> v128 {
+    // `simd_shl` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 64 first to get the documented wrap-around behaviour.
+    unsafe { mem::transmute(simd_shl(a.as_i64x2(), i64x2::splat((amt & 0x3f) as i64))) }
+}
+
+/// Shifts each of the two 64-bit lanes to the right by the specified number
+/// of bits, sign extending.
+///
+/// Only the low 6 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (64).
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shr_s))]
+pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 64 first. Signed lanes make this an arithmetic shift.
+    unsafe { mem::transmute(simd_shr(a.as_i64x2(), i64x2::splat((amt & 0x3f) as i64))) }
+}
+
+/// Shifts each of the two 64-bit lanes to the right by the specified number
+/// of bits, shifting in zeros.
+///
+/// Only the low 6 bits of the shift amount are used, i.e. the amount is
+/// taken modulo the lane width (64).
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shr_u))]
+pub fn i64x2_shr_u(a: v128, amt: u32) -> v128 {
+    // `simd_shr` is undefined for shift counts >= the lane width, so reduce
+    // `amt` modulo 64 first. Unsigned lanes make this a logical shift.
+    unsafe { mem::transmute(simd_shr(a.as_u64x2(), u64x2::splat((amt & 0x3f) as u64))) }
+}
+
+/// Lane-wise addition of two 128-bit vectors, each viewed as two packed
+/// 64-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.add))]
+pub fn i64x2_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Lane-wise subtraction (`a - b`) of two 128-bit vectors, each viewed as
+/// two packed 64-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.sub))]
+pub fn i64x2_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Lane-wise absolute value of a 128-bit vector interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.abs))]
+pub fn f32x4_abs(a: v128) -> v128 {
+    unsafe {
+        let lanes = a.as_f32x4();
+        mem::transmute(llvm_f32x4_abs(lanes))
+    }
+}
+
+/// Negates each lane of a 128-bit vector interpreted as four 32-bit floating
+/// point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.neg))]
+pub fn f32x4_neg(a: v128) -> v128 {
+    // Negation is expressed as a lane-wise multiplication by -1.0.
+    unsafe { f32x4_mul(a, mem::transmute(f32x4(-1.0, -1.0, -1.0, -1.0))) }
+}
+
+/// Lane-wise square root of a 128-bit vector interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sqrt))]
+pub fn f32x4_sqrt(a: v128) -> v128 {
+    unsafe {
+        let lanes = a.as_f32x4();
+        mem::transmute(llvm_f32x4_sqrt(lanes))
+    }
+}
+
+/// Lane-wise addition of two 128-bit vectors, each interpreted as four
+/// 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.add))]
+pub fn f32x4_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Lane-wise subtraction (`a - b`) of two 128-bit vectors, each interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sub))]
+pub fn f32x4_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors, each interpreted as four
+/// 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.mul))]
+pub fn f32x4_mul(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(simd_mul(lhs, rhs))
+    }
+}
+
+/// Lane-wise division (`a / b`) of two 128-bit vectors, each interpreted as
+/// four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.div))]
+pub fn f32x4_div(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(simd_div(lhs, rhs))
+    }
+}
+
+/// Lane-wise minimum of two 128-bit vectors, each interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.min))]
+pub fn f32x4_min(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(llvm_f32x4_min(lhs, rhs))
+    }
+}
+
+/// Lane-wise maximum of two 128-bit vectors, each viewed as four 32-bit
+/// floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.max))]
+pub fn f32x4_max(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
+        mem::transmute(llvm_f32x4_max(lhs, rhs))
+    }
+}
+
+/// Lane-wise absolute value of a 128-bit vector viewed as two 64-bit floating
+/// point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.abs))]
+pub fn f64x2_abs(a: v128) -> v128 {
+    unsafe {
+        let lanes = a.as_f64x2();
+        mem::transmute(llvm_f64x2_abs(lanes))
+    }
+}
+
+/// Negates each lane of a 128-bit vector interpreted as two 64-bit floating
+/// point numbers.
+///
+/// The negation is expressed as a lane-wise multiplication of `a` by a vector
+/// whose two lanes are both `-1.0`.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.neg))]
+pub fn f64x2_neg(a: v128) -> v128 {
+    // NOTE(review): this is a multiply by -1.0 rather than a dedicated negate;
+    // confirm it lowers to `f64x2.neg` and gives the desired sign for NaN lanes.
+    unsafe { f64x2_mul(a, mem::transmute(f64x2(-1.0, -1.0))) }
+}
+
+/// Lane-wise square root of a 128-bit vector viewed as two 64-bit floating
+/// point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.sqrt))]
+pub fn f64x2_sqrt(a: v128) -> v128 {
+    unsafe {
+        let lanes = a.as_f64x2();
+        mem::transmute(llvm_f64x2_sqrt(lanes))
+    }
+}
+
+/// Lane-wise sum of two 128-bit vectors, each viewed as two 64-bit floating
+/// point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.add))]
+pub fn f64x2_add(a: v128, b: v128) -> v128 {
+    unsafe {
+        let lhs = a.as_f64x2();
+        let rhs = b.as_f64x2();
+        mem::transmute(simd_add(lhs, rhs))
+    }
+}
+
+/// Lane-wise difference (`a - b`) of two 128-bit vectors, each viewed as two
+/// 64-bit floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.sub))]
+pub fn f64x2_sub(a: v128, b: v128) -> v128 {
+    unsafe {
+        let lhs = a.as_f64x2();
+        let rhs = b.as_f64x2();
+        mem::transmute(simd_sub(lhs, rhs))
+    }
+}
+
+/// Lane-wise product of two 128-bit vectors, each viewed as two 64-bit
+/// floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.mul))]
+pub fn f64x2_mul(a: v128, b: v128) -> v128 {
+    unsafe {
+        let lhs = a.as_f64x2();
+        let rhs = b.as_f64x2();
+        mem::transmute(simd_mul(lhs, rhs))
+    }
+}
+
+/// Lane-wise quotient (`a / b`) of two 128-bit vectors, each viewed as two
+/// 64-bit floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.div))]
+pub fn f64x2_div(a: v128, b: v128) -> v128 {
+    unsafe {
+        let lhs = a.as_f64x2();
+        let rhs = b.as_f64x2();
+        mem::transmute(simd_div(lhs, rhs))
+    }
+}
+
+/// Lane-wise minimum of two 128-bit vectors, each viewed as two 64-bit
+/// floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.min))]
+pub fn f64x2_min(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f64x2(), b.as_f64x2());
+        mem::transmute(llvm_f64x2_min(lhs, rhs))
+    }
+}
+
+/// Lane-wise maximum of two 128-bit vectors, each viewed as two 64-bit
+/// floating point lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.max))]
+pub fn f64x2_max(a: v128, b: v128) -> v128 {
+    unsafe {
+        let (lhs, rhs) = (a.as_f64x2(), b.as_f64x2());
+        mem::transmute(llvm_f64x2_max(lhs, rhs))
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as four 32-bit floating point
+/// numbers to a 32-bit signed integer, yielding a 128-bit vector of four
+/// `i32` lanes.
+///
+/// Intended semantics: NaN is converted to 0, and an out-of-bounds value
+/// becomes the nearest representable integer.
+///
+/// NOTE(review): the body is a plain `simd_cast`; the disabled test at the
+/// end of this file records a case that did not saturate - confirm.
+#[inline]
+#[cfg_attr(test, assert_instr("i32x4.trunc_s/f32x4:sat"))]
+pub fn i32x4_trunc_s_f32x4_sat(a: v128) -> v128 {
+    unsafe {
+        let truncated: i32x4 = simd_cast(a.as_f32x4());
+        mem::transmute(truncated)
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as four 32-bit floating point
+/// numbers to a 32-bit unsigned integer, yielding a 128-bit vector of four
+/// `u32` lanes.
+///
+/// Intended semantics: NaN is converted to 0, and an out-of-bounds value
+/// becomes the nearest representable integer.
+///
+/// NOTE(review): the body is a plain `simd_cast`; confirm that it actually
+/// saturates as described.
+#[inline]
+#[cfg_attr(test, assert_instr("i32x4.trunc_u/f32x4:sat"))]
+pub fn i32x4_trunc_u_f32x4_sat(a: v128) -> v128 {
+    unsafe {
+        let truncated: u32x4 = simd_cast(a.as_f32x4());
+        mem::transmute(truncated)
+    }
+}
+
+/// Converts a 128-bit vector interpreted as two 64-bit floating point numbers
+/// into a 128-bit vector of two 64-bit signed integers.
+///
+/// Intended semantics: NaN is converted to 0, and an out-of-bounds value
+/// becomes the nearest representable integer.
+///
+/// NOTE(review): the body is a plain `simd_cast`; confirm that it actually
+/// saturates as described.
+#[inline]
+#[cfg_attr(test, assert_instr("i64x2.trunc_s/f64x2:sat"))]
+pub fn i64x2_trunc_s_f64x2_sat(a: v128) -> v128 {
+    unsafe { mem::transmute(simd_cast::<_, i64x2>(a.as_f64x2())) }
+}
+
+/// Casts each lane of a 128-bit vector viewed as two 64-bit floating point
+/// numbers to a 64-bit unsigned integer, yielding a 128-bit vector of two
+/// `u64` lanes.
+///
+/// Intended semantics: NaN is converted to 0, and an out-of-bounds value
+/// becomes the nearest representable integer.
+///
+/// NOTE(review): the body is a plain `simd_cast`; confirm that it actually
+/// saturates as described.
+#[inline]
+#[cfg_attr(test, assert_instr("i64x2.trunc_u/f64x2:sat"))]
+pub fn i64x2_trunc_u_f64x2_sat(a: v128) -> v128 {
+    unsafe {
+        let truncated: u64x2 = simd_cast(a.as_f64x2());
+        mem::transmute(truncated)
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as four 32-bit signed integers
+/// to a 32-bit floating point number, yielding a 128-bit vector of four `f32`
+/// lanes.
+#[inline]
+#[cfg_attr(test, assert_instr("f32x4.convert_s/i32x4"))]
+pub fn f32x4_convert_s_i32x4(a: v128) -> v128 {
+    unsafe {
+        let converted: f32x4 = simd_cast(a.as_i32x4());
+        mem::transmute(converted)
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as four 32-bit unsigned
+/// integers to a 32-bit floating point number, yielding a 128-bit vector of
+/// four `f32` lanes.
+#[inline]
+#[cfg_attr(test, assert_instr("f32x4.convert_u/i32x4"))]
+pub fn f32x4_convert_u_i32x4(a: v128) -> v128 {
+    unsafe {
+        let converted: f32x4 = simd_cast(a.as_u32x4());
+        mem::transmute(converted)
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as two 64-bit signed integers
+/// to a 64-bit floating point number, yielding a 128-bit vector of two `f64`
+/// lanes.
+#[inline]
+#[cfg_attr(test, assert_instr("f64x2.convert_s/i64x2"))]
+pub fn f64x2_convert_s_i64x2(a: v128) -> v128 {
+    unsafe {
+        let converted: f64x2 = simd_cast(a.as_i64x2());
+        mem::transmute(converted)
+    }
+}
+
+/// Casts each lane of a 128-bit vector viewed as two 64-bit unsigned integers
+/// to a 64-bit floating point number, yielding a 128-bit vector of two `f64`
+/// lanes.
+#[inline]
+#[cfg_attr(test, assert_instr("f64x2.convert_u/i64x2"))]
+pub fn f64x2_convert_u_i64x2(a: v128) -> v128 {
+    unsafe {
+        let converted: f64x2 = simd_cast(a.as_u64x2());
+        mem::transmute(converted)
+    }
+}
+
+// #[cfg(test)]
+// pub mod tests {
+//     use super::*;
+//     use std;
+//     use std::mem;
+//     use std::prelude::v1::*;
+//     use wasm_bindgen_test::*;
+//
+//     fn compare_bytes(a: v128, b: v128) {
+//         let a: [u8; 16] = unsafe { mem::transmute(a) };
+//         let b: [u8; 16] = unsafe { mem::transmute(b) };
+//         assert_eq!(a, b);
+//     }
+//
+//     #[wasm_bindgen_test]
+//     fn v128_const() {
+//         const A: v128 = unsafe {
+//             v128::const_([
+//                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+//             ])
+//         };
+//         compare_bytes(A, A);
+//     }
+//
+//     macro_rules! test_splat {
+//         ($test_id:ident: $id:ident($val:expr) => $($vals:expr),*) => {
+//             #[wasm_bindgen_test]
+//             fn $test_id() {
+//                 const A: v128 = unsafe {
+//                     $id::splat($val)
+//                 };
+//                 const B: v128 = unsafe {
+//                     v128::const_([$($vals),*])
+//                 };
+//                 compare_bytes(A, B);
+//             }
+//         }
+//     }
+//
+//     test_splat!(i8x16_splat: i8x16(42) => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42);
+//     test_splat!(i16x8_splat: i16x8(42) => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0);
+//     test_splat!(i32x4_splat: i32x4(42) => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0);
+//     test_splat!(i64x2_splat: i64x2(42) => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0);
+//     test_splat!(f32x4_splat: f32x4(42.) => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66);
+//     test_splat!(f64x2_splat: f64x2(42.) => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64);
+//
+//     // tests extract and replace lanes
+//     macro_rules! test_extract {
+//         ($test_id:ident: $id:ident[$ety:ident] => $extract_fn:ident | [$val:expr; $count:expr]
+//          | [$($vals:expr),*] => ($other:expr)
+//          | $($ids:expr),*) => {
+//             #[wasm_bindgen_test]
+//             fn $test_id() {
+//                 unsafe {
+//                     // splat vector and check that all indices contain the same value
+//                     // splatted:
+//                     const A: v128 = unsafe {
+//                         $id::splat($val)
+//                     };
+//                     $(
+//                         assert_eq!($id::$extract_fn(A, $ids) as $ety, $val);
+//                     )*;
+//
+//                     // create a vector from array and check that the indices contain
+//                     // the same values as in the array:
+//                     let arr: [$ety; $count] = [$($vals),*];
+//                     let mut vec: v128 = mem::transmute(arr);
+//                     $(
+//                         assert_eq!($id::$extract_fn(vec, $ids) as $ety, arr[$ids]);
+//                     )*;
+//
+//                     // replace lane 0 with another value
+//                     vec = $id::replace_lane(vec, 0, $other);
+//                     assert_ne!($id::$extract_fn(vec, 0) as $ety, arr[0]);
+//                     assert_eq!($id::$extract_fn(vec, 0) as $ety, $other);
+//                 }
+//             }
+//         }
+//     }
+//
+//     test_extract!(i8x16_extract_u: i8x16[u8] => extract_lane_u | [255; 16]
+//                   | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => (42)
+//                   | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+//     );
+//     test_extract!(i8x16_extract_s: i8x16[i8] => extract_lane_s | [-122; 16]
+//                   | [0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15] => (-42)
+//                   | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+//     );
+//
+//     test_extract!(i16x8_extract_u: i16x8[u16] => extract_lane_u | [255; 8]
+//                   | [0, 1, 2, 3, 4, 5, 6, 7]  => (42) | 0, 1, 2, 3, 4, 5, 6, 7
+//     );
+//     test_extract!(i16x8_extract_s: i16x8[i16] => extract_lane_s | [-122; 8]
+//                   | [0, -1, 2, -3, 4, -5, 6, -7]  => (-42) | 0, 1, 2, 3, 4, 5, 6, 7
+//     );
+//     test_extract!(i32x4_extract: i32x4[i32] => extract_lane | [-122; 4]
+//                   | [0, -1, 2, -3]  => (42) | 0, 1, 2, 3
+//     );
+//     test_extract!(i64x2_extract: i64x2[i64] => extract_lane | [-122; 2]
+//                   | [0, -1]  => (42) | 0, 1
+//     );
+//     test_extract!(f32x4_extract: f32x4[f32] => extract_lane | [-122.; 4]
+//                   | [0., -1., 2., -3.]  => (42.) | 0, 1, 2, 3
+//     );
+//     test_extract!(f64x2_extract: f64x2[f64] => extract_lane | [-122.; 2]
+//                   | [0., -1.]  => (42.) | 0, 1
+//     );
+//
+//     #[wasm_bindgen_test]
+//     fn v8x16_shuffle() {
+//         unsafe {
+//             let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+//             let b = [
+//                 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+//                 31,
+//             ];
+//
+//             let vec_a: v128 = mem::transmute(a);
+//             let vec_b: v128 = mem::transmute(b);
+//
+//             let vec_r = v8x16_shuffle!(
+//                 vec_a,
+//                 vec_b,
+//                 [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]
+//             );
+//
+//             let e =
+//                 [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30];
+//             let vec_e: v128 = mem::transmute(e);
+//             compare_bytes(vec_r, vec_e);
+//         }
+//     }
+//
+//     macro_rules! floating_point {
+//         (f32) => {
+//             true
+//         };
+//         (f64) => {
+//             true
+//         };
+//         ($id:ident) => {
+//             false
+//         };
+//     }
+//
+//     trait IsNan: Sized {
+//         fn is_nan(self) -> bool {
+//             false
+//         }
+//     }
+//     impl IsNan for i8 {}
+//     impl IsNan for i16 {}
+//     impl IsNan for i32 {}
+//     impl IsNan for i64 {}
+//
+//     macro_rules! test_bop {
+//         ($id:ident[$ety:ident; $ecount:expr] |
+//          $binary_op:ident [$op_test_id:ident] :
+//          ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+//             test_bop!(
+//                 $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]:
+//                 ([$($in_a),*], [$($in_b),*]) => [$($out),*]
+//             );
+//
+//         };
+//         ($id:ident[$ety:ident; $ecount:expr] => $oty:ident |
+//          $binary_op:ident [$op_test_id:ident] :
+//          ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+//             #[wasm_bindgen_test]
+//             fn $op_test_id() {
+//                 unsafe {
+//                     let a_input: [$ety; $ecount] = [$($in_a),*];
+//                     let b_input: [$ety; $ecount] = [$($in_b),*];
+//                     let output: [$oty; $ecount] = [$($out),*];
+//
+//                     let a_vec_in: v128 = mem::transmute(a_input);
+//                     let b_vec_in: v128 = mem::transmute(b_input);
+//                     let vec_res: v128 = $id::$binary_op(a_vec_in, b_vec_in);
+//
+//                     let res: [$oty; $ecount] = mem::transmute(vec_res);
+//
+//                     if !floating_point!($ety) {
+//                         assert_eq!(res, output);
+//                     } else {
+//                         for i in 0..$ecount {
+//                             let r = res[i];
+//                             let o = output[i];
+//                             assert_eq!(r.is_nan(), o.is_nan());
+//                             if !r.is_nan() {
+//                                 assert_eq!(r, o);
+//                             }
+//                         }
+//                     }
+//                 }
+//             }
+//         }
+//     }
+//
+//     macro_rules! test_bops {
+//         ($id:ident[$ety:ident; $ecount:expr] |
+//          $binary_op:ident [$op_test_id:ident]:
+//          ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => {
+//             #[wasm_bindgen_test]
+//             fn $op_test_id() {
+//                 unsafe {
+//                     let a_input: [$ety; $ecount] = [$($in_a),*];
+//                     let output: [$ety; $ecount] = [$($out),*];
+//
+//                     let a_vec_in: v128 = mem::transmute(a_input);
+//                     let vec_res: v128 = $id::$binary_op(a_vec_in, $in_b);
+//
+//                     let res: [$ety; $ecount] = mem::transmute(vec_res);
+//                     assert_eq!(res, output);
+//                 }
+//             }
+//         }
+//     }
+//
+//     macro_rules! test_uop {
+//         ($id:ident[$ety:ident; $ecount:expr] |
+//          $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => {
+//             #[wasm_bindgen_test]
+//             fn $op_test_id() {
+//                 unsafe {
+//                     let a_input: [$ety; $ecount] = [$($in_a),*];
+//                     let output: [$ety; $ecount] = [$($out),*];
+//
+//                     let a_vec_in: v128 = mem::transmute(a_input);
+//                     let vec_res: v128 = $id::$unary_op(a_vec_in);
+//
+//                     let res: [$ety; $ecount] = mem::transmute(vec_res);
+//                     assert_eq!(res, output);
+//                 }
+//             }
+//         }
+//     }
+//
+//     test_bop!(i8x16[i8; 16] | add[i8x16_add_test]:
+//               ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1],
+//                [8, i8::min_value(), 10, 11, 12, 13, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1]) =>
+//               [8, i8::max_value(), 12, 14, 16, 18, 20, i8::min_value(), 2, 2, 2, 2, 2, 2, 2, 2]);
+//     test_bop!(i8x16[i8; 16] | sub[i8x16_sub_test]:
+//               ([0, -1, 2, 3, 4, 5, 6, -1, 1, 1, 1, 1, 1, 1, 1, 1],
+//                [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1]) =>
+//               [-8, i8::max_value(), -8, -8, -8, -8, -8, i8::min_value(), 0, 0, 0, 0, 0, 0, 0, 0]);
+//     test_bop!(i8x16[i8; 16] | mul[i8x16_mul_test]:
+//               ([0, -2, 2, 3, 4, 5, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1],
+//                [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1]) =>
+//               [0, 0, 20, 33, 48, 65, 84, -2, 1, 1, 1, 1, 1, 1, 1, 1]);
+//     test_uop!(i8x16[i8; 16] | neg[i8x16_neg_test]:
+//               [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1] =>
+//               [-8, i8::min_value(), -10, -11, -12, -13, -14, i8::min_value() + 1, -1, -1, -1, -1, -1, -1, -1, -1]);
+//
+//     test_bop!(i16x8[i16; 8] | add[i16x8_add_test]:
+//               ([0, -1, 2, 3, 4, 5, 6, i16::max_value()],
+//                [8, i16::min_value(), 10, 11, 12, 13, 14, 1]) =>
+//               [8, i16::max_value(), 12, 14, 16, 18, 20, i16::min_value()]);
+//     test_bop!(i16x8[i16; 8] | sub[i16x8_sub_test]:
+//               ([0, -1, 2, 3, 4, 5, 6, -1],
+//                [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()]) =>
+//               [-8, i16::max_value(), -8, -8, -8, -8, -8, i16::min_value()]);
+//     test_bop!(i16x8[i16; 8] | mul[i16x8_mul_test]:
+//               ([0, -2, 2, 3, 4, 5, 6, 2],
+//                [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()]) =>
+//               [0, 0, 20, 33, 48, 65, 84, -2]);
+//     test_uop!(i16x8[i16; 8] | neg[i16x8_neg_test]:
+//               [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()] =>
+//               [-8, i16::min_value(), -10, -11, -12, -13, -14, i16::min_value() + 1]);
+//
+//     test_bop!(i32x4[i32; 4] | add[i32x4_add_test]:
+//               ([0, -1, 2, i32::max_value()],
+//                [8, i32::min_value(), 10, 1]) =>
+//               [8, i32::max_value(), 12, i32::min_value()]);
+//     test_bop!(i32x4[i32; 4] | sub[i32x4_sub_test]:
+//               ([0, -1, 2, -1],
+//                [8, i32::min_value(), 10, i32::max_value()]) =>
+//               [-8, i32::max_value(), -8, i32::min_value()]);
+//     test_bop!(i32x4[i32; 4] | mul[i32x4_mul_test]:
+//               ([0, -2, 2, 2],
+//                [8, i32::min_value(), 10, i32::max_value()]) =>
+//               [0, 0, 20, -2]);
+//     test_uop!(i32x4[i32; 4] | neg[i32x4_neg_test]:
+//               [8, i32::min_value(), 10, i32::max_value()] =>
+//               [-8, i32::min_value(), -10, i32::min_value() + 1]);
+//
+//     test_bop!(i64x2[i64; 2] | add[i64x2_add_test]:
+//               ([-1, i64::max_value()],
+//                [i64::min_value(), 1]) =>
+//               [i64::max_value(), i64::min_value()]);
+//     test_bop!(i64x2[i64; 2] | sub[i64x2_sub_test]:
+//               ([-1, -1],
+//                [i64::min_value(), i64::max_value()]) =>
+//               [ i64::max_value(), i64::min_value()]);
+//     // note: mul for i64x2 is not part of the spec
+//     test_uop!(i64x2[i64; 2] | neg[i64x2_neg_test]:
+//               [i64::min_value(), i64::max_value()] =>
+//               [i64::min_value(), i64::min_value() + 1]);
+//
+//     test_bops!(i8x16[i8; 16] | shl[i8x16_shl_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+//                [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]);
+//     test_bops!(i16x8[i16; 8] | shl[i16x8_shl_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+//                [0, -2, 4, 6, 8, 10, 12, -2]);
+//     test_bops!(i32x4[i32; 4] | shl[i32x4_shl_test]:
+//                ([0, -1, 2, 3], 1) => [0, -2, 4, 6]);
+//     test_bops!(i64x2[i64; 2] | shl[i64x2_shl_test]:
+//                ([0, -1], 1) => [0, -2]);
+//
+//     test_bops!(i8x16[i8; 16] | shr_s[i8x16_shr_s_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+//                [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+//     test_bops!(i16x8[i16; 8] | shr_s[i16x8_shr_s_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+//                [0, -1, 1, 1, 2, 2, 3, i16::max_value() / 2]);
+//     test_bops!(i32x4[i32; 4] | shr_s[i32x4_shr_s_test]:
+//                ([0, -1, 2, 3], 1) => [0, -1, 1, 1]);
+//     test_bops!(i64x2[i64; 2] | shr_s[i64x2_shr_s_test]:
+//                ([0, -1], 1) => [0, -1]);
+//
+//     test_bops!(i8x16[i8; 16] | shr_u[i8x16_uhr_u_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+//                [0, i8::max_value(), 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+//     test_bops!(i16x8[i16; 8] | shr_u[i16x8_uhr_u_test]:
+//                ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+//                [0, i16::max_value(), 1, 1, 2, 2, 3, i16::max_value() / 2]);
+//     test_bops!(i32x4[i32; 4] | shr_u[i32x4_uhr_u_test]:
+//                ([0, -1, 2, 3], 1) => [0, i32::max_value(), 1, 1]);
+//     test_bops!(i64x2[i64; 2] | shr_u[i64x2_uhr_u_test]:
+//                ([0, -1], 1) => [0, i64::max_value()]);
+//
+//     #[wasm_bindgen_test]
+//     fn v128_bitwise_logical_ops() {
+//         unsafe {
+//             let a: [u32; 4] = [u32::max_value(), 0, u32::max_value(), 0];
+//             let b: [u32; 4] = [u32::max_value(); 4];
+//             let c: [u32; 4] = [0; 4];
+//
+//             let vec_a: v128 = mem::transmute(a);
+//             let vec_b: v128 = mem::transmute(b);
+//             let vec_c: v128 = mem::transmute(c);
+//
+//             let r: v128 = v128::and(vec_a, vec_a);
+//             compare_bytes(r, vec_a);
+//             let r: v128 = v128::and(vec_a, vec_b);
+//             compare_bytes(r, vec_a);
+//             let r: v128 = v128::or(vec_a, vec_b);
+//             compare_bytes(r, vec_b);
+//             let r: v128 = v128::not(vec_b);
+//             compare_bytes(r, vec_c);
+//             let r: v128 = v128::xor(vec_a, vec_c);
+//             compare_bytes(r, vec_a);
+//
+//             let r: v128 = v128::bitselect(vec_b, vec_c, vec_b);
+//             compare_bytes(r, vec_b);
+//             let r: v128 = v128::bitselect(vec_b, vec_c, vec_c);
+//             compare_bytes(r, vec_c);
+//             let r: v128 = v128::bitselect(vec_b, vec_c, vec_a);
+//             compare_bytes(r, vec_a);
+//         }
+//     }
+//
+//     macro_rules! test_bool_red {
+//         ($id:ident[$test_id:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => {
+//             #[wasm_bindgen_test]
+//             fn $test_id() {
+//                 unsafe {
+//                     let vec_a: v128 = mem::transmute([$($true),*]); // true
+//                     let vec_b: v128 = mem::transmute([$($false),*]); // false
+//                     let vec_c: v128 = mem::transmute([$($alt),*]); // alternating
+//
+//                     assert_eq!($id::any_true(vec_a), 1);
+//                     assert_eq!($id::any_true(vec_b), 0);
+//                     assert_eq!($id::any_true(vec_c), 1);
+//
+//                     assert_eq!($id::all_true(vec_a), 1);
+//                     assert_eq!($id::all_true(vec_b), 0);
+//                     assert_eq!($id::all_true(vec_c), 0);
+//                 }
+//             }
+//         }
+//     }
+//
+//     test_bool_red!(
+//         i8x16[i8x16_boolean_reductions]
+//             | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+//             | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+//             | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
+//     );
+//     test_bool_red!(
+//         i16x8[i16x8_boolean_reductions]
+//             | [1_i16, 1, 1, 1, 1, 1, 1, 1]
+//             | [0_i16, 0, 0, 0, 0, 0, 0, 0]
+//             | [1_i16, 0, 1, 0, 1, 0, 1, 0]
+//     );
+//     test_bool_red!(
+//         i32x4[i32x4_boolean_reductions]
+//             | [1_i32, 1, 1, 1]
+//             | [0_i32, 0, 0, 0]
+//             | [1_i32, 0, 1, 0]
+//     );
+//     test_bool_red!(
+//         i64x2[i64x2_boolean_reductions] | [1_i64, 1] | [0_i64, 0] | [1_i64, 0]
+//     );
+//
+//     test_bop!(i8x16[i8; 16] | eq[i8x16_eq_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+//                [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i16x8[i16; 8] | eq[i16x8_eq_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i32x4[i32; 4] | eq[i32x4_eq_test]:
+//               ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+//     test_bop!(i64x2[i64; 2] | eq[i64x2_eq_test]: ([0, 1], [0, 2]) => [-1, 0]);
+//     test_bop!(f32x4[f32; 4] => i32 | eq[f32x4_eq_test]:
+//               ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+//     test_bop!(f64x2[f64; 2] => i64 | eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+//
+//     test_bop!(i8x16[i8; 16] | ne[i8x16_ne_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+//                [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i16x8[i16; 8] | ne[i16x8_ne_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i32x4[i32; 4] | ne[i32x4_ne_test]:
+//               ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+//     test_bop!(i64x2[i64; 2] | ne[i64x2_ne_test]: ([0, 1], [0, 2]) => [0, -1]);
+//     test_bop!(f32x4[f32; 4] => i32 | ne[f32x4_ne_test]:
+//               ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+//     test_bop!(f64x2[f64; 2] => i64 | ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+//
+//     test_bop!(i8x16[i8; 16] | lt[i8x16_lt_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+//                [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i16x8[i16; 8] | lt[i16x8_lt_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i32x4[i32; 4] | lt[i32x4_lt_test]:
+//               ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+//     test_bop!(i64x2[i64; 2] | lt[i64x2_lt_test]: ([0, 1], [0, 2]) => [0, -1]);
+//     test_bop!(f32x4[f32; 4] => i32 | lt[f32x4_lt_test]:
+//               ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+//     test_bop!(f64x2[f64; 2] => i64 | lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+//
+//     test_bop!(i8x16[i8; 16] | gt[i8x16_gt_test]:
+//           ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15],
+//            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i16x8[i16; 8] | gt[i16x8_gt_test]:
+//               ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+//               [0, -1, 0, -1 ,0, -1, 0, 0]);
+//     test_bop!(i32x4[i32; 4] | gt[i32x4_gt_test]:
+//               ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]);
+//     test_bop!(i64x2[i64; 2] | gt[i64x2_gt_test]: ([0, 2], [0, 1]) => [0, -1]);
+//     test_bop!(f32x4[f32; 4] => i32 | gt[f32x4_gt_test]:
+//               ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]);
+//     test_bop!(f64x2[f64; 2] => i64 | gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]);
+//
+//     test_bop!(i8x16[i8; 16] | ge[i8x16_ge_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+//                [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i16x8[i16; 8] | ge[i16x8_ge_test]:
+//               ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i32x4[i32; 4] | ge[i32x4_ge_test]:
+//               ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+//     test_bop!(i64x2[i64; 2] | ge[i64x2_ge_test]: ([0, 1], [0, 2]) => [-1, 0]);
+//     test_bop!(f32x4[f32; 4] => i32 | ge[f32x4_ge_test]:
+//               ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+//     test_bop!(f64x2[f64; 2] => i64 | ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+//
+//     test_bop!(i8x16[i8; 16] | le[i8x16_le_test]:
+//               ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15],
+//                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+//               ) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i16x8[i16; 8] | le[i16x8_le_test]:
+//               ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+//               [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+//     test_bop!(i32x4[i32; 4] | le[i32x4_le_test]:
+//               ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]);
+//     test_bop!(i64x2[i64; 2] | le[i64x2_le_test]: ([0, 2], [0, 1]) => [-1, 0]);
+//     test_bop!(f32x4[f32; 4] => i32 | le[f32x4_le_test]:
+//               ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]);
+//     test_bop!(f64x2[f64; 2] => i64 | le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]);
+//
+//     #[wasm_bindgen_test]
+//     fn v128_bitwise_load_store() {
+//         unsafe {
+//             let mut arr: [i32; 4] = [0, 1, 2, 3];
+//
+//             let vec = v128::load(arr.as_ptr() as *const v128);
+//             let vec = i32x4::add(vec, vec);
+//             v128::store(arr.as_mut_ptr() as *mut v128, vec);
+//
+//             assert_eq!(arr, [0, 2, 4, 6]);
+//         }
+//     }
+//
+//     test_uop!(f32x4[f32; 4] | neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]);
+//     test_uop!(f32x4[f32; 4] | abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]);
+//     test_bop!(f32x4[f32; 4] | min[f32x4_min_test]:
+//               ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]);
+//     test_bop!(f32x4[f32; 4] | min[f32x4_min_test_nan]:
+//               ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+//               => [0., -3., -4., std::f32::NAN]);
+//     test_bop!(f32x4[f32; 4] | max[f32x4_max_test]:
+//               ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]);
+//     test_bop!(f32x4[f32; 4] | max[f32x4_max_test_nan]:
+//               ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+//               => [1., -1., 7., std::f32::NAN]);
+//     test_bop!(f32x4[f32; 4] | add[f32x4_add_test]:
+//               ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]);
+//     test_bop!(f32x4[f32; 4] | sub[f32x4_sub_test]:
+//               ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]);
+//     test_bop!(f32x4[f32; 4] | mul[f32x4_mul_test]:
+//               ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]);
+//     test_bop!(f32x4[f32; 4] | div[f32x4_div_test]:
+//               ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]);
+//
+//     test_uop!(f64x2[f64; 2] | neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]);
+//     test_uop!(f64x2[f64; 2] | abs[f64x2_abs_test]: [0., -1.] => [ 0., 1.]);
+//     test_bop!(f64x2[f64; 2] | min[f64x2_min_test]:
+//               ([0., -1.], [1., -3.]) => [0., -3.]);
+//     test_bop!(f64x2[f64; 2] | min[f64x2_min_test_nan]:
+//               ([7., 8.], [-4., std::f64::NAN])
+//               => [ -4., std::f64::NAN]);
+//     test_bop!(f64x2[f64; 2] | max[f64x2_max_test]:
+//               ([0., -1.], [1., -3.]) => [1., -1.]);
+//     test_bop!(f64x2[f64; 2] | max[f64x2_max_test_nan]:
+//               ([7., 8.], [ -4., std::f64::NAN])
+//               => [7., std::f64::NAN]);
+//     test_bop!(f64x2[f64; 2] | add[f64x2_add_test]:
+//               ([0., -1.], [1., -3.]) => [1., -4.]);
+//     test_bop!(f64x2[f64; 2] | sub[f64x2_sub_test]:
+//               ([0., -1.], [1., -3.]) => [-1., 2.]);
+//     test_bop!(f64x2[f64; 2] | mul[f64x2_mul_test]:
+//               ([0., -1.], [1., -3.]) => [0., 3.]);
+//     test_bop!(f64x2[f64; 2] | div[f64x2_div_test]:
+//               ([0., -8.], [1., 4.]) => [0., -2.]);
+//
+//     macro_rules! test_conv {
+//         ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr,  $to:expr) => {
+//             #[wasm_bindgen_test]
+//             fn $test_id() {
+//                 unsafe {
+//                     let from: v128 = mem::transmute($from);
+//                     let to: v128 = mem::transmute($to);
+//
+//                     let r: v128 = $to_ty::$conv_id(from);
+//
+//                     compare_bytes(r, to);
+//                 }
+//             }
+//         };
+//     }
+//
+//     test_conv!(
+//         f32x4_convert_s_i32x4 | convert_s_i32x4 | f32x4 | [1_i32, 2, 3, 4],
+//         [1_f32, 2., 3., 4.]
+//     );
+//     test_conv!(
+//         f32x4_convert_u_i32x4
+//             | convert_u_i32x4
+//             | f32x4
+//             | [u32::max_value(), 2, 3, 4],
+//         [u32::max_value() as f32, 2., 3., 4.]
+//     );
+//     test_conv!(
+//         f64x2_convert_s_i64x2 | convert_s_i64x2 | f64x2 | [1_i64, 2],
+//         [1_f64, 2.]
+//     );
+//     test_conv!(
+//         f64x2_convert_u_i64x2
+//             | convert_u_i64x2
+//             | f64x2
+//             | [u64::max_value(), 2],
+//         [18446744073709552000.0, 2.]
+//     );
+//
+//     // FIXME: this fails, and produces -2147483648 instead of saturating at
+//     // i32::max_value():
+//     //
+//     // test_conv!(i32x4_trunc_s_f32x4_sat | trunc_s_f32x4_sat | i32x4
+//     //            | [1_f32, 2., (i32::max_value() as f32 + 1.), 4.],
+//     //            [1_i32, 2, i32::max_value(), 4]);
+//     //
+//     // FIXME: add other saturating tests
+// }
diff --git a/library/stdarch/crates/core_arch/src/x86/abm.rs b/library/stdarch/crates/core_arch/src/x86/abm.rs
new file mode 100644
index 00000000000..b763902211b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/abm.rs
@@ -0,0 +1,62 @@
+//! Advanced Bit Manipulation (ABM) instructions
+//!
+//! The POPCNT and LZCNT have their own CPUID bits to indicate support.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Counts the leading most significant zero bits.
+///
+/// When the operand is zero, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u32)
+#[inline]
+#[target_feature(enable = "lzcnt")]
+#[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _lzcnt_u32(x: u32) -> u32 {
+    x.leading_zeros()
+}
+
+/// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt32)
+#[inline]
+#[target_feature(enable = "popcnt")]
+#[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _popcnt32(x: i32) -> i32 {
+    x.count_ones() as i32
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "lzcnt")]
+    unsafe fn test_lzcnt_u32() {
+        assert_eq!(_lzcnt_u32(0b0101_1010), 25);
+    }
+
+    #[simd_test(enable = "popcnt")]
+    unsafe fn test_popcnt32() {
+        assert_eq!(_popcnt32(0b0101_1010), 4);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/adx.rs b/library/stdarch/crates/core_arch/src/x86/adx.rs
new file mode 100644
index 00000000000..c59743980f2
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/adx.rs
@@ -0,0 +1,46 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+    #[link_name = "llvm.x86.addcarry.u32"]
+    fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+    #[link_name = "llvm.x86.subborrow.u32"]
+    fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+}
+
+/// Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in
+/// (carry flag), and store the unsigned 32-bit result in out, and the carry-out
+/// is returned (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    let (a, b) = llvm_addcarry_u32(c_in, a, b);
+    *out = b;
+    a
+}
+
+/// Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in
+/// (carry or overflow flag), and store the unsigned 32-bit result in out, and
+/// the carry-out is returned (carry or overflow flag).
+#[inline]
+#[target_feature(enable = "adx")]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+#[cfg(not(stage0))]
+pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    _addcarry_u32(c_in, a, b, out)
+}
+
+/// Add unsigned 8-bit borrow c_in (carry flag) to unsigned 32-bit integer b,
+/// and subtract the result from unsigned 32-bit integer a. Store the unsigned
+/// 32-bit result in out, and return the borrow-out (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(sbb))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    let (a, b) = llvm_subborrow_u32(c_in, a, b);
+    *out = b;
+    a
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/aes.rs b/library/stdarch/crates/core_arch/src/x86/aes.rs
new file mode 100644
index 00000000000..c90bb122d5a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/aes.rs
@@ -0,0 +1,175 @@
+//! AES New Instructions (AES-NI)
+//!
+//! The intrinsics here correspond to those in the `wmmintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use core_arch::x86::__m128i;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.aesni.aesdec"]
+    fn aesdec(a: __m128i, round_key: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.aesni.aesdeclast"]
+    fn aesdeclast(a: __m128i, round_key: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.aesni.aesenc"]
+    fn aesenc(a: __m128i, round_key: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.aesni.aesenclast"]
+    fn aesenclast(a: __m128i, round_key: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.aesni.aesimc"]
+    fn aesimc(a: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.aesni.aeskeygenassist"]
+    fn aeskeygenassist(a: __m128i, imm8: u8) -> __m128i;
+}
+
+/// Perform one round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdec_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesdec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i {
+    aesdec(a, round_key)
+}
+
+/// Perform the last round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdeclast_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesdeclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
+    aesdeclast(a, round_key)
+}
+
+/// Perform one round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenc_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesenc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i {
+    aesenc(a, round_key)
+}
+
+/// Perform the last round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesenclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
+    aesenclast(a, round_key)
+}
+
+/// Perform the `InvMixColumns` transformation on `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesimc_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesimc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i {
+    aesimc(a)
+}
+
+/// Assist in expanding the AES cipher key.
+///
+/// Assist in expanding the AES cipher key by computing steps towards
+/// generating a round key for encryption cipher using data from `a` and an
+/// 8-bit round constant `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aeskeygenassist_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aeskeygenassist, imm8 = 0))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aeskeygenassist_si128(a: __m128i, imm8: i32) -> __m128i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            aeskeygenassist(a, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+#[cfg(test)]
+mod tests {
+    // The constants in the tests below are just bit patterns. They should not
+    // be interpreted as integers; signedness does not make sense for them, but
+    // __m128i happens to be defined in terms of signed integers.
+    #![allow(overflowing_literals)]
+
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aesdec_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+        let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee);
+        let r = _mm_aesdec_si128(a, k);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aesdeclast_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+        let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493);
+        let r = _mm_aesdeclast_si128(a, k);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aesenc_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+        let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333);
+        let r = _mm_aesenc_si128(a, k);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aesenclast_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+        let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8);
+        let r = _mm_aesenclast_si128(a, k);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aesimc_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc714195.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let e = _mm_set_epi64x(0xc66c82284ee40aa0, 0x6633441122770055);
+        let r = _mm_aesimc_si128(a);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "aes")]
+    unsafe fn test_mm_aeskeygenassist_si128() {
+        // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx.
+        let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+        let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea);
+        let r = _mm_aeskeygenassist_si128(a, 5);
+        assert_eq_m128i(r, e);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
new file mode 100644
index 00000000000..9a350f6eab7
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -0,0 +1,5060 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use intrinsics;
+use mem;
+use ptr;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Add packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_add(a, b)
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
+    simd_add(a, b)
+}
+
+/// Compute the bitwise AND of packed double-precision (64-bit)
+/// floating-point elements
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: Should be 'vandpd' instruction.
+// See https://github.com/rust-lang-nursery/stdsimd/issues/71
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
+    let a: u64x4 = mem::transmute(a);
+    let b: u64x4 = mem::transmute(b);
+    mem::transmute(simd_and(a, b))
+}
+
+/// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
+    let a: u32x8 = mem::transmute(a);
+    let b: u32x8 = mem::transmute(b);
+    mem::transmute(simd_and(a, b))
+}
+
+/// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: Should be 'vorpd' instruction.
+// See https://github.com/rust-lang-nursery/stdsimd/issues/71
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
+    let a: u64x4 = mem::transmute(a);
+    let b: u64x4 = mem::transmute(b);
+    mem::transmute(simd_or(a, b))
+}
+
+/// Compute the bitwise OR of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
+    let a: u32x8 = mem::transmute(a);
+    let b: u32x8 = mem::transmute(b);
+    mem::transmute(simd_or(a, b))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit
+/// lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d]);
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 6),
+                _ => shuffle4!($a, $b, $c, 7),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 4),
+                _ => shuffle2!($a, 5),
+            }
+        };
+    }
+    match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in `a` within
+/// 128-bit lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
+            match (imm8 >> 6) & 0x3 {
+                0 => shuffle4!($a, $b, $c, 8, $e, $f, $g, 12),
+                1 => shuffle4!($a, $b, $c, 9, $e, $f, $g, 13),
+                2 => shuffle4!($a, $b, $c, 10, $e, $f, $g, 14),
+                _ => shuffle4!($a, $b, $c, 11, $e, $f, $g, 15),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 4) & 0x3 {
+                0 => shuffle3!($a, $b, 8, $e, $f, 12),
+                1 => shuffle3!($a, $b, 9, $e, $f, 13),
+                2 => shuffle3!($a, $b, 10, $e, $f, 14),
+                _ => shuffle3!($a, $b, 11, $e, $f, 15),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $e:expr) => {
+            match (imm8 >> 2) & 0x3 {
+                0 => shuffle2!($a, 0, $e, 4),
+                1 => shuffle2!($a, 1, $e, 5),
+                2 => shuffle2!($a, 2, $e, 6),
+                _ => shuffle2!($a, 3, $e, 7),
+            }
+        };
+    }
+    match imm8 & 0x3 {
+        0 => shuffle1!(0, 4),
+        1 => shuffle1!(1, 5),
+        2 => shuffle1!(2, 6),
+        _ => shuffle1!(3, 7),
+    }
+}
+
+/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+/// elements in `a`
+/// and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: Should be 'vandnpd' instruction.
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
+    let a: u64x4 = mem::transmute(a);
+    let b: u64x4 = mem::transmute(b);
+    mem::transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
+}
+
+/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+/// elements in `a`
+/// and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
+    let a: u32x8 = mem::transmute(a);
+    let b: u32x8 = mem::transmute(b);
+    mem::transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`, and return packed maximum values
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
+    maxpd256(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and return packed maximum values
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
+    maxps256(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`, and return packed minimum values
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vminpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
+    minpd256(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and return packed minimum values
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vminps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
+    minps256(a, b)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_mul(a, b)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmulps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
+    simd_mul(a, b)
+}
+
+/// Alternatively add and subtract packed double-precision (64-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
+    addsubpd256(a, b)
+}
+
+/// Alternatively add and subtract packed single-precision (32-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
+    addsubps256(a, b)
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in `b`
+/// from packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_sub(a, b)
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in `b`
+/// from packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
+    simd_sub(a, b)
+}
+
+/// Compute the division of each of the 8 packed 32-bit floating-point elements
+/// in `a` by the corresponding packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdivps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
+    simd_div(a, b)
+}
+
+/// Compute the division of each of the 4 packed 64-bit floating-point elements
+/// in `a` by the corresponding packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_div(a, b)
+}
+
+/// Round packed double-precision (64-bit) floating point elements in `a`
+/// according to the flag `b`. The value of `b` may be as follows:
+///
+/// - `0x00`: Round to the nearest whole number.
+/// - `0x01`: Round down, toward negative infinity.
+/// - `0x02`: Round up, toward positive infinity.
+/// - `0x03`: Truncate the values.
+///
+/// For a complete list of options, check [the LLVM docs][llvm_docs].
+///
+/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd, b = 0x3))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            roundpd256(a, $imm8)
+        };
+    }
+    constify_imm8!(b, call)
+}
+
+/// Round packed double-precision (64-bit) floating point elements in `a`
+/// toward positive infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
+    roundpd256(a, 0x02)
+}
+
+/// Round packed double-precision (64-bit) floating point elements in `a`
+/// toward negative infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
+    roundpd256(a, 0x01)
+}
+
+/// Round packed single-precision (32-bit) floating point elements in `a`
+/// according to the flag `b`. The value of `b` may be as follows:
+///
+/// - `0x00`: Round to the nearest whole number.
+/// - `0x01`: Round down, toward negative infinity.
+/// - `0x02`: Round up, toward positive infinity.
+/// - `0x03`: Truncate the values.
+///
+/// For a complete list of options, check [the LLVM docs][llvm_docs].
+///
+/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps, b = 0x00))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            roundps256(a, $imm8)
+        };
+    }
+    constify_imm8!(b, call)
+}
+
+/// Round packed single-precision (32-bit) floating point elements in `a`
+/// toward positive infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
+    roundps256(a, 0x02)
+}
+
+/// Round packed single-precision (32-bit) floating point elements in `a`
+/// toward negative infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
+    roundps256(a, 0x01)
+}
+
+/// Return the square root of packed single-precision (32-bit) floating point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
+    sqrtps256(a)
+}
+
+/// Return the square root of packed double-precision (64-bit) floating point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
+    sqrtpd256(a)
+}
+
+/// Blends packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using control mask `imm8`.
+///
+/// Bit `i` of `imm8` (for `i` in `0..4`) picks the source of result element
+/// `i`: a clear bit takes the element from `a`, a set bit takes it from `b`.
+/// Bits above the low four are not inspected.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// Note: LLVM7 prefers single-precision blend instructions when
+// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
+// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
+#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    // Every shuffle index is written as a literal, so each mask bit is
+    // resolved by its own `match`, one macro level per result element.
+    // Index `i` names element `i` of `a`; index `i + 4` names element `i`
+    // of `b`.
+    macro_rules! blend4 {
+        // No trailing `;` here: the macro is expanded in expression position
+        // (as a match-arm value).
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    // Bit 3 decides result element 3.
+    macro_rules! blend3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match imm8 & 0x8 {
+                0 => blend4!($a, $b, $c, 3),
+                _ => blend4!($a, $b, $c, 7),
+            }
+        };
+    }
+    // Bit 2 decides result element 2.
+    macro_rules! blend2 {
+        ($a:expr, $b:expr) => {
+            match imm8 & 0x4 {
+                0 => blend3!($a, $b, 2),
+                _ => blend3!($a, $b, 6),
+            }
+        };
+    }
+    // Bit 1 decides result element 1.
+    macro_rules! blend1 {
+        ($a:expr) => {
+            match imm8 & 0x2 {
+                0 => blend2!($a, 1),
+                _ => blend2!($a, 5),
+            }
+        };
+    }
+    // Bit 0 decides result element 0.
+    match imm8 & 0x1 {
+        0 => blend1!(0),
+        _ => blend1!(4),
+    }
+}
+
+/// Blends packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using control mask `imm8`.
+///
+/// Bit `i` of `imm8` (for `i` in `0..8`) picks the source of result element
+/// `i`: a clear bit takes the element from `a`, a set bit takes it from `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    // Every shuffle index is written as a literal, so the mask is resolved
+    // two bits (two result elements) per macro level. Index `i` names
+    // element `i` of `a`; index `i + 8` names element `i` of `b`.
+    macro_rules! blend4 {
+        // No trailing `;` here: the macro is expanded in expression position
+        // (as a match-arm value).
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    // Bits 6 and 7 decide result elements 6 and 7.
+    macro_rules! blend3 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
+                0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
+                0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
+                _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
+            }
+        };
+    }
+    // Bits 4 and 5 decide result elements 4 and 5.
+    macro_rules! blend2 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => blend3!($a, $b, $c, $d, 4, 5),
+                0b01 => blend3!($a, $b, $c, $d, 12, 5),
+                0b10 => blend3!($a, $b, $c, $d, 4, 13),
+                _ => blend3!($a, $b, $c, $d, 12, 13),
+            }
+        };
+    }
+    // Bits 2 and 3 decide result elements 2 and 3.
+    macro_rules! blend1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => blend2!($a, $b, 2, 3),
+                0b01 => blend2!($a, $b, 10, 3),
+                0b10 => blend2!($a, $b, 2, 11),
+                _ => blend2!($a, $b, 10, 11),
+            }
+        };
+    }
+    // Bits 0 and 1 decide result elements 0 and 1.
+    match imm8 & 0b11 {
+        0b00 => blend1!(0, 1),
+        0b01 => blend1!(8, 1),
+        0b10 => blend1!(0, 9),
+        _ => blend1!(8, 9),
+    }
+}
+
+/// Blends packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vblendvpd(a, b, c)
+}
+
+/// Blends packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vblendvps(a, b, c)
+}
+
+/// Conditionally multiplies the packed single-precision (32-bit)
+/// floating-point elements in `a` and `b` using the high 4 bits in `imm8`,
+/// sums the four products, and conditionally returns the sum
+/// using the low 4 bits of `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dp_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    // NOTE(review): `constify_imm8!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! dot_product {
+        ($control:expr) => {
+            vdpps(a, b, $control)
+        };
+    }
+    constify_imm8!(imm8, dot_product)
+}
+
+/// Horizontally adds adjacent pairs of elements in the two packed vectors
+/// of 4 64-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in even locations,
+/// while sums of elements from `b` are returned in odd locations.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhaddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
+    vhaddpd(a, b)
+}
+
+/// Horizontally adds adjacent pairs of elements in the two packed vectors
+/// of 8 32-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in locations of
+/// indices 0, 1, 4, 5; while sums of elements from `b` are returned in
+/// locations 2, 3, 6, 7.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhaddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
+    vhaddps(a, b)
+}
+
+/// Horizontally subtracts adjacent pairs of elements in the two packed
+/// vectors of 4 64-bit floating points `a` and `b`.
+/// In the result, differences of elements from `a` are returned in even
+/// locations, while differences of elements from `b` are returned in odd
+/// locations.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
+    vhsubpd(a, b)
+}
+
+/// Horizontally subtracts adjacent pairs of elements in the two packed
+/// vectors of 8 32-bit floating points `a` and `b`.
+/// In the result, differences of elements from `a` are returned in locations
+/// of indices 0, 1, 4, 5; while differences of elements from `b` are returned
+/// in locations 2, 3, 6, 7.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
+    vhsubps(a, b)
+}
+
+/// Computes the bitwise XOR of the packed double-precision (64-bit)
+/// floating-point elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME Should be 'vxorpd' instruction.
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
+    // The XOR is carried out on the raw bits, viewed as four 64-bit integers.
+    let lhs = mem::transmute::<__m256d, u64x4>(a);
+    let rhs = mem::transmute::<__m256d, u64x4>(b);
+    let bits = simd_xor(lhs, rhs);
+    mem::transmute(bits)
+}
+
+/// Computes the bitwise XOR of the packed single-precision (32-bit)
+/// floating-point elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
+    // The XOR is carried out on the raw bits, viewed as eight 32-bit integers.
+    let lhs = mem::transmute::<__m256, u32x8>(a);
+    let rhs = mem::transmute::<__m256, u32x8>(b);
+    let bits = simd_xor(lhs, rhs);
+    mem::transmute(bits)
+}
+
+// Comparison predicate constants, covering every value in `0x00..=0x1f`.
+// Each constant in `0x10..=0x1f` describes the same relation as the constant
+// exactly `0x10` below it, with the signaling/non-signaling behaviour
+// swapped (e.g. `_CMP_EQ_OQ = 0x00` vs. `_CMP_EQ_OS = 0x10`).
+/// Equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_OQ: i32 = 0x00;
+/// Less-than (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LT_OS: i32 = 0x01;
+/// Less-than-or-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LE_OS: i32 = 0x02;
+/// Unordered (non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_UNORD_Q: i32 = 0x03;
+/// Not-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_UQ: i32 = 0x04;
+/// Not-less-than (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLT_US: i32 = 0x05;
+/// Not-less-than-or-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLE_US: i32 = 0x06;
+/// Ordered (non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_ORD_Q: i32 = 0x07;
+/// Equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_UQ: i32 = 0x08;
+/// Not-greater-than-or-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGE_US: i32 = 0x09;
+/// Not-greater-than (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGT_US: i32 = 0x0a;
+/// False (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
+/// Not-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
+/// Greater-than-or-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GE_OS: i32 = 0x0d;
+/// Greater-than (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GT_OS: i32 = 0x0e;
+/// True (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
+/// Equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_OS: i32 = 0x10;
+/// Less-than (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LT_OQ: i32 = 0x11;
+/// Less-than-or-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LE_OQ: i32 = 0x12;
+/// Unordered (signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_UNORD_S: i32 = 0x13;
+/// Not-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_US: i32 = 0x14;
+/// Not-less-than (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLT_UQ: i32 = 0x15;
+/// Not-less-than-or-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLE_UQ: i32 = 0x16;
+/// Ordered (signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_ORD_S: i32 = 0x17;
+/// Equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_US: i32 = 0x18;
+/// Not-greater-than-or-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGE_UQ: i32 = 0x19;
+/// Not-greater-than (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGT_UQ: i32 = 0x1a;
+/// False (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_FALSE_OS: i32 = 0x1b;
+/// Not-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_OS: i32 = 0x1c;
+/// Greater-than-or-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GE_OQ: i32 = 0x1d;
+/// Greater-than (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GT_OQ: i32 = 0x1e;
+/// True (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_TRUE_US: i32 = 0x1f;
+
+/// Compares the packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`, using the comparison operand
+/// given by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmppd(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Compares the packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`, using the comparison operand
+/// given by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmppd256(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Compares the packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`, using the comparison operand
+/// given by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmpps(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Compares the packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`, using the comparison operand
+/// given by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmpps256(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Compares the lower double-precision (64-bit) floating-point element of
+/// `a` and `b`, using the comparison operand given by `imm8`.
+/// The comparison result is stored in the lower element of the returned
+/// vector; the upper element of the returned vector is copied from the upper
+/// element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmpsd(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Compares the lower single-precision (32-bit) floating-point element of
+/// `a` and `b`, using the comparison operand given by `imm8`.
+/// The comparison result is stored in the lower element of the returned
+/// vector; the upper 3 packed elements of the returned vector are copied
+/// from the upper elements of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 {
+    // NOTE(review): `constify_imm6!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! compare {
+        ($predicate:expr) => {
+            vcmpss(a, b, $predicate)
+        };
+    }
+    constify_imm6!(imm8, compare)
+}
+
+/// Converts the packed 32-bit integers in `a` to packed double-precision
+/// (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
+    // View the input as four 32-bit lanes, then cast lane by lane.
+    let ints = a.as_i32x4();
+    simd_cast(ints)
+}
+
+/// Converts the packed 32-bit integers in `a` to packed single-precision
+/// (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
+    // View the input as eight 32-bit lanes before converting.
+    let ints = a.as_i32x8();
+    vcvtdq2ps(ints)
+}
+
+/// Converts the packed double-precision (64-bit) floating-point elements
+/// in `a` to packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
+    vcvtpd2ps(a)
+}
+
+/// Converts the packed single-precision (32-bit) floating-point elements
+/// in `a` to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
+    // Convert first, then reinterpret the result as `__m256i`.
+    let ints = vcvtps2dq(a);
+    mem::transmute(ints)
+}
+
+/// Converts the packed single-precision (32-bit) floating-point elements
+/// in `a` to packed double-precision (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
+    simd_cast(a)
+}
+
+/// Converts the packed double-precision (64-bit) floating-point elements
+/// in `a` to packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
+    // Convert first, then reinterpret the result as `__m128i`.
+    let ints = vcvttpd2dq(a);
+    mem::transmute(ints)
+}
+
+/// Converts the packed double-precision (64-bit) floating-point elements
+/// in `a` to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
+    // Convert first, then reinterpret the result as `__m128i`.
+    let ints = vcvtpd2dq(a);
+    mem::transmute(ints)
+}
+
+/// Converts the packed single-precision (32-bit) floating-point elements
+/// in `a` to packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
+    // Convert first, then reinterpret the result as `__m256i`.
+    let ints = vcvttps2dq(a);
+    mem::transmute(ints)
+}
+
+/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+///
+/// Only the lowest bit of `imm8` is inspected: `0` yields elements 0..=3 of
+/// `a`, `1` yields elements 4..=7.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vextractf128, imm8 = 1)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 {
+    // Second shuffle operand; none of the indices below refer to it.
+    let filler = _mm256_undefined_ps();
+    if imm8 & 1 == 0 {
+        simd_shuffle4(a, filler, [0, 1, 2, 3])
+    } else {
+        simd_shuffle4(a, filler, [4, 5, 6, 7])
+    }
+}
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+///
+/// Only the lowest bit of `imm8` is inspected: `0` yields elements 0..=1 of
+/// `a`, `1` yields elements 2..=3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vextractf128, imm8 = 1)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d {
+    // Second shuffle operand; none of the indices below refer to it.
+    let filler = _mm256_undefined_pd();
+    if imm8 & 1 == 0 {
+        simd_shuffle2(a, filler, [0, 1])
+    } else {
+        simd_shuffle2(a, filler, [2, 3])
+    }
+}
+
+/// Extracts 128 bits (composed of integer data) from `a`, selected with
+/// `imm8`.
+///
+/// Only the lowest bit of `imm8` is inspected: `0` yields 64-bit elements
+/// 0..=1 of `a`, `1` yields 64-bit elements 2..=3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vextractf128, imm8 = 1)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i {
+    // Second shuffle operand; none of the indices below refer to it.
+    let filler = _mm256_undefined_si256().as_i64x4();
+    let source = a.as_i64x4();
+    let half: i64x2 = if imm8 & 1 == 0 {
+        simd_shuffle2(source, filler, [0, 1])
+    } else {
+        simd_shuffle2(source, filler, [2, 3])
+    };
+    mem::transmute(half)
+}
+
+/// Zeroes the contents of all XMM or YMM registers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroall)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vzeroall))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zeroall() {
+    vzeroall()
+}
+
+/// Zeroes the upper 128 bits of all YMM registers;
+/// the lower 128 bits of the registers are unmodified.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroupper)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vzeroupper))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zeroupper() {
+    vzeroupper()
+}
+
+/// Shuffles the single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes, using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
+    // The control vector is passed on as eight 32-bit lanes.
+    let control = b.as_i32x8();
+    vpermilps256(a, control)
+}
+
+/// Shuffles the single-precision (32-bit) floating-point elements in `a`,
+/// using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
+    // The control vector is passed on as four 32-bit lanes.
+    let control = b.as_i32x4();
+    vpermilps(a, control)
+}
+
+/// Shuffles the single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes, using the control in `imm8`.
+///
+/// `imm8` is read as four 2-bit selectors, lowest bits first. Selector `k`
+/// names the element of each 128-bit lane that is placed at position `k` of
+/// that lane in the result; the same selectors apply to both lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 {
+    let ctrl = (imm8 & 0xFF) as u8;
+    // Final step: all four selectors are literals. The upper lane reuses
+    // them, offset by 4.
+    macro_rules! emit {
+        ($s0:expr, $s1:expr, $s2:expr, $s3:expr) => {
+            simd_shuffle8(
+                a,
+                _mm256_undefined_ps(),
+                [$s0, $s1, $s2, $s3, $s0 + 4, $s1 + 4, $s2 + 4, $s3 + 4],
+            )
+        };
+    }
+    // Resolves selector 3 (bits 7:6).
+    macro_rules! pick_fourth {
+        ($s0:expr, $s1:expr, $s2:expr) => {
+            match (ctrl >> 6) & 3 {
+                0 => emit!($s0, $s1, $s2, 0),
+                1 => emit!($s0, $s1, $s2, 1),
+                2 => emit!($s0, $s1, $s2, 2),
+                _ => emit!($s0, $s1, $s2, 3),
+            }
+        };
+    }
+    // Resolves selector 2 (bits 5:4).
+    macro_rules! pick_third {
+        ($s0:expr, $s1:expr) => {
+            match (ctrl >> 4) & 3 {
+                0 => pick_fourth!($s0, $s1, 0),
+                1 => pick_fourth!($s0, $s1, 1),
+                2 => pick_fourth!($s0, $s1, 2),
+                _ => pick_fourth!($s0, $s1, 3),
+            }
+        };
+    }
+    // Resolves selector 1 (bits 3:2).
+    macro_rules! pick_second {
+        ($s0:expr) => {
+            match (ctrl >> 2) & 3 {
+                0 => pick_third!($s0, 0),
+                1 => pick_third!($s0, 1),
+                2 => pick_third!($s0, 2),
+                _ => pick_third!($s0, 3),
+            }
+        };
+    }
+    // Resolves selector 0 (bits 1:0).
+    match ctrl & 3 {
+        0 => pick_second!(0),
+        1 => pick_second!(1),
+        2 => pick_second!(2),
+        _ => pick_second!(3),
+    }
+}
+
+/// Shuffles the single-precision (32-bit) floating-point elements in `a`,
+/// using the control in `imm8`.
+///
+/// `imm8` is read as four 2-bit selectors, lowest bits first. Selector `k`
+/// names the element of `a` that is placed at position `k` of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_ps)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 {
+    let ctrl = (imm8 & 0xFF) as u8;
+    // Final step: all four selectors are literals.
+    macro_rules! emit {
+        ($s0:expr, $s1:expr, $s2:expr, $s3:expr) => {
+            simd_shuffle4(a, _mm_undefined_ps(), [$s0, $s1, $s2, $s3])
+        };
+    }
+    // Resolves selector 3 (bits 7:6).
+    macro_rules! pick_fourth {
+        ($s0:expr, $s1:expr, $s2:expr) => {
+            match (ctrl >> 6) & 3 {
+                0 => emit!($s0, $s1, $s2, 0),
+                1 => emit!($s0, $s1, $s2, 1),
+                2 => emit!($s0, $s1, $s2, 2),
+                _ => emit!($s0, $s1, $s2, 3),
+            }
+        };
+    }
+    // Resolves selector 2 (bits 5:4).
+    macro_rules! pick_third {
+        ($s0:expr, $s1:expr) => {
+            match (ctrl >> 4) & 3 {
+                0 => pick_fourth!($s0, $s1, 0),
+                1 => pick_fourth!($s0, $s1, 1),
+                2 => pick_fourth!($s0, $s1, 2),
+                _ => pick_fourth!($s0, $s1, 3),
+            }
+        };
+    }
+    // Resolves selector 1 (bits 3:2).
+    macro_rules! pick_second {
+        ($s0:expr) => {
+            match (ctrl >> 2) & 3 {
+                0 => pick_third!($s0, 0),
+                1 => pick_third!($s0, 1),
+                2 => pick_third!($s0, 2),
+                _ => pick_third!($s0, 3),
+            }
+        };
+    }
+    // Resolves selector 0 (bits 1:0).
+    match ctrl & 3 {
+        0 => pick_second!(0),
+        1 => pick_second!(1),
+        2 => pick_second!(2),
+        _ => pick_second!(3),
+    }
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
+    vpermilpd256(a, b.as_i64x4())
+}
+
+/// Shuffles the double-precision (64-bit) floating-point elements in `a`,
+/// using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
+    // The control vector is passed on as two 64-bit lanes.
+    let control = b.as_i64x2();
+    vpermilpd(a, control)
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `imm8`.
+///
+/// Bits 0 and 1 of `imm8` choose, for result elements 0 and 1 respectively,
+/// between elements 0 and 1 of `a` (the low lane). Bits 2 and 3 choose, for
+/// result elements 2 and 3 respectively, between elements 2 and 3 of `a`
+/// (the high lane). Higher bits are not inspected.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    // Every shuffle index is written as a literal, so each control bit is
+    // resolved by its own `match`, one macro level per result element.
+    macro_rules! shuffle4 {
+        // No trailing `;` here: the macro is expanded in expression position
+        // (as a match-arm value).
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, _mm256_undefined_pd(), [$a, $b, $c, $d])
+        };
+    }
+    // Bit 3 decides result element 3.
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        };
+    }
+    // Bit 2 decides result element 2.
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    // Bit 1 decides result element 1.
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 0),
+                _ => shuffle2!($a, 1),
+            }
+        };
+    }
+    // Bit 0 decides result element 0.
+    match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// using the control in `imm8`.
+///
+/// Bit 0 of `imm8` chooses which element of `a` (0 or 1) becomes result
+/// element 0; bit 1 does the same for result element 1. Higher bits are not
+/// inspected.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_pd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    // Every shuffle index is written as a literal, so each control bit is
+    // resolved by its own `match`.
+    macro_rules! shuffle2 {
+        // No trailing `;` here: the macro is expanded in expression position
+        // (as a match-arm value).
+        ($a:expr, $b:expr) => {
+            simd_shuffle2(a, _mm_undefined_pd(), [$a, $b])
+        };
+    }
+    // Bit 1 decides result element 1.
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 0),
+                _ => shuffle2!($a, 1),
+            }
+        };
+    }
+    // Bit 0 decides result element 0.
+    match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Shuffles 256-bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    // NOTE(review): `constify_imm8!` is defined outside this view; presumably
+    // it invokes the named macro with a literal immediate - confirm.
+    macro_rules! permute {
+        ($control:expr) => {
+            vperm2f128ps256(a, b, $control)
+        };
+    }
+    constify_imm8!(imm8, permute)
+}
+
+/// Shuffle 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    // `constify_imm8!` receives this helper by name and is expected to invoke
+    // it with the immediate it derives from `imm8`.
+    macro_rules! permute_with {
+        ($ctrl:expr) => {
+            vperm2f128pd256(a, b, $ctrl)
+        };
+    }
+    constify_imm8!(imm8, permute_with)
+}
+
+/// Shuffle 256 bits (composed of integer data) selected by `imm8`
+/// from `a` and `b`.
+///
+/// Both inputs are handed to the underlying binding as `i32x8` lanes and the
+/// result is reinterpreted as `__m256i`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    // `constify_imm8!` receives this helper by name and is expected to invoke
+    // it with the immediate it derives from `imm8`.
+    macro_rules! permute_with {
+        ($ctrl:expr) => {
+            vperm2f128si256(lhs, rhs, $ctrl)
+        };
+    }
+    let shuffled = constify_imm8!(imm8, permute_with);
+    mem::transmute(shuffled)
+}
+
+/// Read the single-precision (32-bit) floating-point value referenced by `f`
+/// and broadcast it to all elements of the returned 256-bit vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ss)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::trivially_copy_pass_by_ref))]
+pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
+    let scalar = *f;
+    _mm256_set1_ps(scalar)
+}
+
+/// Read the single-precision (32-bit) floating-point value referenced by `f`
+/// and broadcast it to all elements of the returned 128-bit vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcast_ss)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::trivially_copy_pass_by_ref))]
+pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
+    let scalar = *f;
+    _mm_set1_ps(scalar)
+}
+
+/// Read the double-precision (64-bit) floating-point value referenced by `f`
+/// and broadcast it to all elements of the returned 256-bit vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_sd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::trivially_copy_pass_by_ref))]
+pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
+    let scalar = *f;
+    _mm256_set1_pd(scalar)
+}
+
+/// Broadcast the 128 bits referenced by `a` (composed of 4 packed single-precision
+/// (32-bit) floating-point elements) to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
+    vbroadcastf128ps256(a) // the reference is forwarded as-is to the binding
+}
+
+/// Broadcast the 128 bits referenced by `a` (composed of 2 packed double-precision
+/// (64-bit) floating-point elements) to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
+    vbroadcastf128pd256(a) // the reference is forwarded as-is to the binding
+}
+
+/// Copy `a` to result, then insert 128 bits (composed of 4 packed
+/// single-precision (32-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+///
+/// Only the lowest bit of `imm8` is examined: when it is clear the first
+/// shuffle pattern is used, otherwise the second one.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vinsertf128, imm8 = 1)
+)]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 {
+    // Cast `b` up front so it can be used as the second shuffle input.
+    let widened = _mm256_castps128_ps256(b);
+    if (imm8 & 1) == 0 {
+        simd_shuffle8(a, widened, [8, 9, 10, 11, 4, 5, 6, 7])
+    } else {
+        simd_shuffle8(a, widened, [0, 1, 2, 3, 8, 9, 10, 11])
+    }
+}
+
+/// Copy `a` to result, then insert 128 bits (composed of 2 packed
+/// double-precision (64-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+///
+/// Only the lowest bit of `imm8` is examined: when it is clear the first
+/// shuffle pattern is used, otherwise the second one.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vinsertf128, imm8 = 1)
+)]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d {
+    // Cast `b` up front so it can be used as the second shuffle input.
+    let widened = _mm256_castpd128_pd256(b);
+    if (imm8 & 1) == 0 {
+        simd_shuffle4(a, widened, [4, 5, 2, 3])
+    } else {
+        simd_shuffle4(a, widened, [0, 1, 4, 5])
+    }
+}
+
+/// Copy `a` to result, then insert 128 bits from `b` into result
+/// at the location specified by `imm8`.
+///
+/// Only the lowest bit of `imm8` is examined. The shuffle is carried out on
+/// `i64x4` views of the operands and the outcome is reinterpreted as
+/// `__m256i`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vinsertf128, imm8 = 1)
+)]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
+    let widened = _mm256_castsi128_si256(b).as_i64x4();
+    let base = a.as_i64x4();
+    let dst: i64x4 = if (imm8 & 1) == 0 {
+        simd_shuffle4(base, widened, [4, 5, 2, 3])
+    } else {
+        simd_shuffle4(base, widened, [0, 1, 4, 5])
+    };
+    mem::transmute(dst)
+}
+
+/// Copy `a` to result, and insert the 8-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// `index` is masked with 31 before use, so only its low five bits matter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
+    let lanes = a.as_i8x32();
+    let updated = simd_insert(lanes, (index as u32) & 31, i);
+    mem::transmute(updated)
+}
+
+/// Copy `a` to result, and insert the 16-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// `index` is masked with 15 before use, so only its low four bits matter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i {
+    let lanes = a.as_i16x16();
+    let updated = simd_insert(lanes, (index as u32) & 15, i);
+    mem::transmute(updated)
+}
+
+/// Copy `a` to result, and insert the 32-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// `index` is masked with 7 before use, so only its low three bits matter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i {
+    let lanes = a.as_i32x8();
+    let updated = simd_insert(lanes, (index as u32) & 7, i);
+    mem::transmute(updated)
+}
+
+/// Load 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// The pointer is reinterpreted as `*const __m256d` and dereferenced.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
+    let src = mem_addr as *const __m256d;
+    *src
+}
+
+/// Store 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// Although `mem_addr` is declared as `*const f64`, it is cast to a mutable
+/// vector pointer and written through, so it must point to writable memory.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) {
+    let dst = mem_addr as *mut __m256d;
+    *dst = a;
+}
+
+/// Load 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// The pointer is reinterpreted as `*const __m256` and dereferenced.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
+    let src = mem_addr as *const __m256;
+    *src
+}
+
+/// Store 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// Although `mem_addr` is declared as `*const f32`, it is cast to a mutable
+/// vector pointer and written through, so it must point to writable memory.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) {
+    let dst = mem_addr as *mut __m256;
+    *dst = a;
+}
+
+/// Load 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// The load is performed as a byte-wise copy of `size_of::<__m256d>()` bytes
+/// into a local vector, which is then returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
+    let mut out = _mm256_undefined_pd();
+    let src = mem_addr as *const u8;
+    let dst = &mut out as *mut __m256d as *mut u8;
+    ptr::copy_nonoverlapping(src, dst, mem::size_of::<__m256d>());
+    out
+}
+
+/// Store 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory at `mem_addr`.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
+    storeupd256(mem_addr, a); // pointer and value are forwarded unchanged
+}
+
+/// Load 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// The load is performed as a byte-wise copy of `size_of::<__m256>()` bytes
+/// into a local vector, which is then returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
+    let mut out = _mm256_undefined_ps();
+    let src = mem_addr as *const u8;
+    let dst = &mut out as *mut __m256 as *mut u8;
+    ptr::copy_nonoverlapping(src, dst, mem::size_of::<__m256>());
+    out
+}
+
+/// Store 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory at `mem_addr`.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
+    storeups256(mem_addr, a); // pointer and value are forwarded unchanged
+}
+
+/// Load 256 bits of integer data from memory at `mem_addr` into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
+    *mem_addr // plain dereference: `mem_addr` must be valid for reading a `__m256i`
+}
+
+/// Store 256 bits of integer data from `a` into memory at `mem_addr`.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
+    *mem_addr = a; // plain assignment: `mem_addr` must be valid for writing a `__m256i`
+}
+
+/// Load 256 bits of integer data from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// The load is performed as a byte-wise copy of `size_of::<__m256i>()` bytes
+/// into a local vector, which is then returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
+    let mut out = _mm256_undefined_si256();
+    let src = mem_addr as *const u8;
+    let dst = &mut out as *mut __m256i as *mut u8;
+    ptr::copy_nonoverlapping(src, dst, mem::size_of::<__m256i>());
+    out
+}
+
+/// Store 256 bits of integer data from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// The destination is passed on as a byte pointer and `a` as `i8x32` lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
+    let dst = mem_addr as *mut i8;
+    let bytes = a.as_i8x32();
+    storeudq256(dst, bytes);
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory
+/// into result under control of `mask`: an element is zeroed out when the
+/// high bit of the corresponding mask element is not set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
+    let src = mem_addr as *const i8;
+    let lanes = mask.as_i64x4();
+    maskloadpd256(src, lanes)
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from `a`
+/// into memory using `mask`, which is interpreted as four 64-bit lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
+    let dst = mem_addr as *mut i8;
+    let lanes = mask.as_i64x4();
+    maskstorepd256(dst, lanes, a);
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory
+/// into result under control of `mask`: an element is zeroed out when the
+/// high bit of the corresponding mask element is not set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
+    let src = mem_addr as *const i8;
+    let lanes = mask.as_i64x2();
+    maskloadpd(src, lanes)
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from `a`
+/// into memory using `mask`, which is interpreted as two 64-bit lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
+    let dst = mem_addr as *mut i8;
+    let lanes = mask.as_i64x2();
+    maskstorepd(dst, lanes, a);
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory
+/// into result under control of `mask`: an element is zeroed out when the
+/// high bit of the corresponding mask element is not set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
+    let src = mem_addr as *const i8;
+    let lanes = mask.as_i32x8();
+    maskloadps256(src, lanes)
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from `a`
+/// into memory using `mask`, which is interpreted as eight 32-bit lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
+    let dst = mem_addr as *mut i8;
+    let lanes = mask.as_i32x8();
+    maskstoreps256(dst, lanes, a);
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory
+/// into result under control of `mask`: an element is zeroed out when the
+/// high bit of the corresponding mask element is not set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
+    let src = mem_addr as *const i8;
+    let lanes = mask.as_i32x4();
+    maskloadps(src, lanes)
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from `a`
+/// into memory using `mask`, which is interpreted as four 32-bit lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
+    let dst = mem_addr as *mut i8;
+    let lanes = mask.as_i32x4();
+    maskstoreps(dst, lanes, a);
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and return the results (elements 1, 1, 3, 3, 5, 5, 7, 7 of `a`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movehdup_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
+    simd_shuffle8(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) // both shuffle inputs are `a`
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and return the results (elements 0, 0, 2, 2, 4, 4, 6, 6 of `a`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_moveldup_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
+    simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) // both shuffle inputs are `a`
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements
+/// from `a`, and return the results (elements 0, 0, 2, 2 of `a`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movedup_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
+    simd_shuffle4(a, a, [0, 0, 2, 2]) // both shuffle inputs are `a`
+}
+
+/// Load 256 bits of integer data from unaligned memory into result.
+/// This intrinsic may perform better than `_mm256_loadu_si256` when the
+/// data crosses a cache line boundary.
+///
+/// The address is passed on as a byte pointer and the loaded value is
+/// reinterpreted as `__m256i`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lddqu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vlddqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
+    let src = mem_addr as *const i8;
+    mem::transmute(vlddqu(src))
+}
+
+/// Moves integer data from a 256-bit integer vector to a 32-byte
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
+    intrinsics::nontemporal_store(mem_addr, a); // store is delegated to the intrinsic
+}
+
+/// Moves double-precision values from a 256-bit vector of `[4 x double]`
+/// to a 32-byte aligned memory location. To minimize caching, the data is
+/// flagged as non-temporal (unlikely to be used again soon).
+///
+/// `mem_addr` is reinterpreted as a pointer to a whole `__m256d` before the
+/// store is issued.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
+    let dst = mem_addr as *mut __m256d;
+    intrinsics::nontemporal_store(dst, a);
+}
+
+/// Moves single-precision floating point values from a 256-bit vector
+/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon).
+///
+/// `mem_addr` is reinterpreted as a pointer to a whole `__m256` before the
+/// store is issued.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
+    let dst = mem_addr as *mut __m256;
+    intrinsics::nontemporal_store(dst, a);
+}
+
+/// Compute the approximate reciprocal of each packed single-precision (32-bit)
+/// floating-point element in `a`, and return the results. The maximum
+/// relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vrcpps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
+    vrcpps(a) // all work is done by the `vrcpps` binding
+}
+
+/// Compute the approximate reciprocal square root of each packed single-precision
+/// (32-bit) floating-point element in `a`, and return the results.
+/// The maximum relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vrsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
+    vrsqrtps(a) // all work is done by the `vrsqrtps` binding
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_shuffle4(a, b, [1, 5, 3, 7]) // presumably 0-3 index `a`, 4-7 index `b`
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
+    simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) // presumably 0-7 index `a`, 8-15 `b`
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
+    simd_shuffle4(a, b, [0, 4, 2, 6]) // presumably 0-3 index `a`, 4-7 index `b`
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
+    simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) // presumably 0-7 index `a`, 8-15 `b`
+}
+
+/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// Both operands are passed to the underlying binding as `i64x4` lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
+    let lhs = a.as_i64x4();
+    let rhs = b.as_i64x4();
+    ptestz256(lhs, rhs)
+}
+
+/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// Both operands are passed to the underlying binding as `i64x4` lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
+    let lhs = a.as_i64x4();
+    let rhs = b.as_i64x4();
+    ptestc256(lhs, rhs)
+}
+
+/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
+/// `CF` values are zero, otherwise return 0.
+///
+/// Both operands are passed to the underlying binding as `i64x4` lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
+    let lhs = a.as_i64x4();
+    let rhs = b.as_i64x4();
+    ptestnzc256(lhs, rhs)
+}
+
+/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `ZF` value as an `i32`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
+    vtestzpd256(a, b) // `a` and `b` are forwarded in order
+}
+
+/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value as an `i32`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
+    vtestcpd256(a, b) // `a` and `b` are forwarded in order
+}
+
+/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0 (the result is an `i32`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
+    vtestnzcpd256(a, b) // `a` and `b` are forwarded in order
+}
+
+/// Returns the `ZF` flag of a sign-bit test on the double-precision (64-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 64-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 64-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
+    vtestzpd(a, b)
+}
+
+/// Returns the `CF` flag of a sign-bit test on the double-precision (64-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 64-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 64-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
+    vtestcpd(a, b)
+}
+
+/// Returns 1 if both flags of a sign-bit test on the double-precision (64-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b` are zero, and 0
+/// otherwise. The bitwise AND of `a` and `b` is computed, and `ZF` is set to 1
+/// if the sign bit of each 64-bit element in that intermediate value is zero,
+/// otherwise `ZF` is set to 0. The bitwise NOT of `a` is then ANDed with `b`,
+/// and `CF` is set to 1 if the sign bit of each 64-bit element in that
+/// intermediate value is zero, otherwise `CF` is set to 0. The result is 1
+/// only when `ZF` and `CF` are both 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
+    vtestnzcpd(a, b)
+}
+
+/// Returns the `ZF` flag of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 256-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 32-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 32-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
+    vtestzps256(a, b)
+}
+
+/// Returns the `CF` flag of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 256-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 32-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 32-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
+    vtestcps256(a, b)
+}
+
+/// Returns 1 if both flags of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 256-bit vectors `a` and `b` are zero, and 0
+/// otherwise. The bitwise AND of `a` and `b` is computed, and `ZF` is set to 1
+/// if the sign bit of each 32-bit element in that intermediate value is zero,
+/// otherwise `ZF` is set to 0. The bitwise NOT of `a` is then ANDed with `b`,
+/// and `CF` is set to 1 if the sign bit of each 32-bit element in that
+/// intermediate value is zero, otherwise `CF` is set to 0. The result is 1
+/// only when `ZF` and `CF` are both 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
+    vtestnzcps256(a, b)
+}
+
+/// Returns the `ZF` flag of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 32-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 32-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
+    vtestzps(a, b)
+}
+
+/// Returns the `CF` flag of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b`. The bitwise
+/// AND of `a` and `b` is computed, and `ZF` is set to 1 if the sign bit of
+/// each 32-bit element in that intermediate value is zero, otherwise `ZF` is
+/// set to 0. The bitwise NOT of `a` is then ANDed with `b`, and `CF` is set to
+/// 1 if the sign bit of each 32-bit element in that intermediate value is
+/// zero, otherwise `CF` is set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
+    vtestcps(a, b)
+}
+
+/// Returns 1 if both flags of a sign-bit test on the single-precision (32-bit)
+/// floating-point elements of the 128-bit vectors `a` and `b` are zero, and 0
+/// otherwise. The bitwise AND of `a` and `b` is computed, and `ZF` is set to 1
+/// if the sign bit of each 32-bit element in that intermediate value is zero,
+/// otherwise `ZF` is set to 0. The bitwise NOT of `a` is then ANDed with `b`,
+/// and `CF` is set to 1 if the sign bit of each 32-bit element in that
+/// intermediate value is zero, otherwise `CF` is set to 0. The result is 1
+/// only when `ZF` and `CF` are both 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
+    vtestnzcps(a, b)
+}
+
+/// Returns a mask whose bits are each taken from the most significant bit of
+/// the corresponding packed double-precision (64-bit) floating-point element
+/// in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
+    movmskpd256(a)
+}
+
+/// Returns a mask whose bits are each taken from the most significant bit of
+/// the corresponding packed single-precision (32-bit) floating-point element
+/// in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
+    movmskps256(a)
+}
+
+/// Returns a vector of type `__m256d` with all elements set to zero (a broadcast of `0.0`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))] // FIXME: `vxorpd` would be the expected instruction
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_pd() -> __m256d {
+    _mm256_set1_pd(0.0)
+}
+
+/// Returns a vector of type `__m256` with all elements set to zero (a broadcast of `0.0`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_ps() -> __m256 {
+    _mm256_set1_ps(0.0)
+}
+
+/// Returns a vector of type `__m256i` with all elements set to zero (a broadcast of the byte `0`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_si256() -> __m256i {
+    _mm256_set1_epi8(0)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in the
+/// returned vector; forwards to `_mm256_setr_pd` with the arguments reversed.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no dedicated instruction; the test below expects `vinsertf128`.
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+    _mm256_setr_pd(d, c, b, a)
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in the
+/// returned vector; forwards to `_mm256_setr_ps` with the arguments reversed.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_ps(
+    a: f32,
+    b: f32,
+    c: f32,
+    d: f32,
+    e: f32,
+    f: f32,
+    g: f32,
+    h: f32,
+) -> __m256 {
+    _mm256_setr_ps(h, g, f, e, d, c, b, a)
+}
+
+/// Sets packed 8-bit integers in the returned vector with the supplied values;
+/// forwards to `_mm256_setr_epi8` with the arguments reversed.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi8(
+    e00: i8,
+    e01: i8,
+    e02: i8,
+    e03: i8,
+    e04: i8,
+    e05: i8,
+    e06: i8,
+    e07: i8,
+    e08: i8,
+    e09: i8,
+    e10: i8,
+    e11: i8,
+    e12: i8,
+    e13: i8,
+    e14: i8,
+    e15: i8,
+    e16: i8,
+    e17: i8,
+    e18: i8,
+    e19: i8,
+    e20: i8,
+    e21: i8,
+    e22: i8,
+    e23: i8,
+    e24: i8,
+    e25: i8,
+    e26: i8,
+    e27: i8,
+    e28: i8,
+    e29: i8,
+    e30: i8,
+    e31: i8,
+) -> __m256i {
+    #[rustfmt::skip]
+    _mm256_setr_epi8(
+        e31, e30, e29, e28, e27, e26, e25, e24,
+        e23, e22, e21, e20, e19, e18, e17, e16,
+        e15, e14, e13, e12, e11, e10, e09, e08,
+        e07, e06, e05, e04, e03, e02, e01, e00,
+    )
+}
+
+/// Sets packed 16-bit integers in the returned vector (reversed `_mm256_setr_epi16` call).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi16(
+    e00: i16,
+    e01: i16,
+    e02: i16,
+    e03: i16,
+    e04: i16,
+    e05: i16,
+    e06: i16,
+    e07: i16,
+    e08: i16,
+    e09: i16,
+    e10: i16,
+    e11: i16,
+    e12: i16,
+    e13: i16,
+    e14: i16,
+    e15: i16,
+) -> __m256i {
+    #[rustfmt::skip]
+    _mm256_setr_epi16(
+        e15, e14, e13, e12,
+        e11, e10, e09, e08,
+        e07, e06, e05, e04,
+        e03, e02, e01, e00,
+    )
+}
+
+/// Sets packed 32-bit integers in the returned vector (reversed `_mm256_setr_epi32` call).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi32(
+    e0: i32,
+    e1: i32,
+    e2: i32,
+    e3: i32,
+    e4: i32,
+    e5: i32,
+    e6: i32,
+    e7: i32,
+) -> __m256i {
+    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
+}
+
+/// Sets packed 64-bit integers in the returned vector (reversed `_mm256_setr_epi64x` call).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+    _mm256_setr_epi64x(d, c, b, a)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in the
+/// returned vector in reverse order of `_mm256_set_pd`: `a` is stored first.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+    __m256d(a, b, c, d)
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in the
+/// returned vector in reverse order of `_mm256_set_ps`: `a` is stored first.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_ps(
+    a: f32,
+    b: f32,
+    c: f32,
+    d: f32,
+    e: f32,
+    f: f32,
+    g: f32,
+    h: f32,
+) -> __m256 {
+    __m256(a, b, c, d, e, f, g, h)
+}
+
+/// Sets packed 8-bit integers in the returned vector with the supplied values
+/// in reverse order: `e00` is stored first and `e31` last.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi8(
+    e00: i8,
+    e01: i8,
+    e02: i8,
+    e03: i8,
+    e04: i8,
+    e05: i8,
+    e06: i8,
+    e07: i8,
+    e08: i8,
+    e09: i8,
+    e10: i8,
+    e11: i8,
+    e12: i8,
+    e13: i8,
+    e14: i8,
+    e15: i8,
+    e16: i8,
+    e17: i8,
+    e18: i8,
+    e19: i8,
+    e20: i8,
+    e21: i8,
+    e22: i8,
+    e23: i8,
+    e24: i8,
+    e25: i8,
+    e26: i8,
+    e27: i8,
+    e28: i8,
+    e29: i8,
+    e30: i8,
+    e31: i8,
+) -> __m256i {
+    #[rustfmt::skip]
+    mem::transmute(i8x32::new(
+        e00, e01, e02, e03, e04, e05, e06, e07,
+        e08, e09, e10, e11, e12, e13, e14, e15,
+        e16, e17, e18, e19, e20, e21, e22, e23,
+        e24, e25, e26, e27, e28, e29, e30, e31,
+    ))
+}
+
+/// Sets packed 16-bit integers in the returned vector with the supplied values
+/// in reverse order: `e00` is stored first and `e15` last.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi16(
+    e00: i16,
+    e01: i16,
+    e02: i16,
+    e03: i16,
+    e04: i16,
+    e05: i16,
+    e06: i16,
+    e07: i16,
+    e08: i16,
+    e09: i16,
+    e10: i16,
+    e11: i16,
+    e12: i16,
+    e13: i16,
+    e14: i16,
+    e15: i16,
+) -> __m256i {
+    #[rustfmt::skip]
+    mem::transmute(i16x16::new(
+        e00, e01, e02, e03,
+        e04, e05, e06, e07,
+        e08, e09, e10, e11,
+        e12, e13, e14, e15,
+    ))
+}
+
+/// Sets packed 32-bit integers in the returned vector with the supplied values
+/// in reverse order: `e0` is stored first and `e7` last.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi32(
+    e0: i32,
+    e1: i32,
+    e2: i32,
+    e3: i32,
+    e4: i32,
+    e5: i32,
+    e6: i32,
+    e7: i32,
+) -> __m256i {
+    mem::transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+}
+
+/// Sets packed 64-bit integers in the returned vector with the supplied values
+/// in reverse order: `a` is stored first and `d` last.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+    mem::transmute(i64x4::new(a, b, c, d))
+}
+
+/// Broadcasts double-precision (64-bit) floating-point value `a` to all
+/// four elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
+    _mm256_setr_pd(a, a, a, a)
+}
+
+/// Broadcasts single-precision (32-bit) floating-point value `a` to all
+/// eight elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
+    _mm256_setr_ps(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 8-bit integer `a` to all 32 elements of the returned vector.
+/// This intrinsic may generate the `vpbroadcastb` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+// This intrinsic has no dedicated instruction of its own.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
+    #[rustfmt::skip]
+    _mm256_setr_epi8(
+        a, a, a, a, a, a, a, a,
+        a, a, a, a, a, a, a, a,
+        a, a, a, a, a, a, a, a,
+        a, a, a, a, a, a, a, a,
+    )
+}
+
+/// Broadcasts 16-bit integer `a` to all 16 elements of the returned vector.
+/// This intrinsic may generate the `vpbroadcastw` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+//#[cfg_attr(test, assert_instr(vpshufb))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+// This intrinsic has no dedicated instruction of its own.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
+    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 32-bit integer `a` to all eight elements of the returned vector.
+/// This intrinsic may generate the `vpbroadcastd` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
+    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 64-bit integer `a` to all four elements of the returned vector.
+/// This intrinsic may generate the `vpbroadcastq` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+//#[cfg_attr(test, assert_instr(vmovddup))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+// This intrinsic has no dedicated instruction of its own.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
+    _mm256_setr_epi64x(a, a, a, a)
+}
+
+/// Casts a vector of type `__m256d` to type `__m256` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256` to type `__m256d` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256` to type `__m256i` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256i` to type `__m256` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256d` to type `__m256i` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256i` to type `__m256d` (a bitwise reinterpretation).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
+    mem::transmute(a)
+}
+
+/// Casts a vector of type `__m256` to type `__m128`, keeping elements 0 to 3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps256_ps128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
+    simd_shuffle4(a, a, [0, 1, 2, 3])
+}
+
+/// Casts a vector of type `__m256d` to type `__m128d`, keeping elements 0 and 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd256_pd128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
+    simd_shuffle2(a, a, [0, 1])
+}
+
+/// Casts a vector of type `__m256i` to type `__m128i`.
+///
+/// The input is viewed as four 64-bit lanes, of which lanes 0 and 1 are kept.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_si128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
+    let lanes = a.as_i64x4();
+    // Select lanes 0 and 1, then reinterpret the 64-bit pair as `__m128i`.
+    mem::transmute::<i64x2, __m128i>(simd_shuffle2(lanes, lanes, [0, 1]))
+}
+
+/// Casts a vector of type `__m128` to type `__m256`; only elements 0 to 3 are
+/// meaningful, the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps128_ps256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
+    // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]); upper lanes reuse index 0 for now
+    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
+}
+
+/// Casts a vector of type `__m128d` to type `__m256d`; only elements 0 and 1
+/// are meaningful, the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd128_pd256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
+    // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]); upper lanes reuse index 0 for now
+    simd_shuffle4(a, a, [0, 1, 0, 0])
+}
+
+/// Casts a vector of type `__m128i` to type `__m256i`.
+///
+/// The input is viewed as two 64-bit lanes that become lanes 0 and 1 of the
+/// result; the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
+    let lanes = a.as_i64x2();
+    // FIXME: the upper lanes should use undefined indices, i.e.
+    // simd_shuffle4(lanes, lanes, [0, 1, -1, -1]); index 0 is used for now.
+    mem::transmute::<i64x4, __m256i>(simd_shuffle4(lanes, lanes, [0, 1, 0, 0]))
+}
+
+/// Zero-extends a 128-bit floating-point vector of `[4 x float]` into a
+/// 256-bit floating-point vector of `[8 x float]`: the lower 128 bits hold
+/// the source vector `a` and the upper 128 bits are set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextps128_ps256)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
+    simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Zero-extends a 128-bit integer vector into a 256-bit integer vector.
+///
+/// The lower 128 bits of the result hold the source vector `a`; the upper
+/// 128 bits are set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextsi128_si256)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
+    let zero_half = _mm_setzero_si128().as_i64x2();
+    let low_half = a.as_i64x2();
+    // Concatenate the source lanes with the zero lanes.
+    let widened: i64x4 = simd_shuffle4(low_half, zero_half, [0, 1, 2, 3]);
+    mem::transmute(widened)
+}
+
+/// Zero-extends a 128-bit floating-point vector of `[2 x double]` into a
+/// 256-bit floating-point vector of `[4 x double]`: the lower 128 bits
+/// hold the source vector `a` and the upper 128 bits are set
+/// to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextpd128_pd256)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+    simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
+}
+
+/// Returns a vector of type `__m256` whose elements hold an unspecified, but
+/// initialized, value.
+///
+/// "Undefined" follows Intel's naming: callers must not rely on the contents.
+/// The vector is zero-filled because returning uninitialized memory as an
+/// initialized `__m256` would be undefined behavior in Rust.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_ps() -> __m256 {
+    // Any value satisfies the contract; all-zero is always a valid `__m256`.
+    _mm256_set1_ps(0.0)
+}
+
+/// Returns a vector of type `__m256d` whose elements hold an unspecified, but
+/// initialized, value.
+///
+/// "Undefined" follows Intel's naming: callers must not rely on the contents.
+/// The vector is zero-filled because returning uninitialized memory as an
+/// initialized `__m256d` would be undefined behavior in Rust.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_pd() -> __m256d {
+    // Any value satisfies the contract; all-zero is always a valid `__m256d`.
+    _mm256_set1_pd(0.0)
+}
+
+/// Returns a vector of type `__m256i` whose elements hold an unspecified, but
+/// initialized, value.
+///
+/// "Undefined" follows Intel's naming: callers must not rely on the contents.
+/// The vector is zero-filled because returning uninitialized memory as an
+/// initialized `__m256i` would be undefined behavior in Rust.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_si256() -> __m256i {
+    // Any value satisfies the contract; all-zero is always a valid `__m256i`.
+    _mm256_set1_epi8(0)
+}
+
+/// Builds a `__m256` from two `__m128` halves: `lo` supplies lanes 0-3 and
+/// `hi` supplies lanes 4-7 of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
+    // `lo` is the first shuffle operand, so indices 0-3 pick its lanes and
+    // indices 4-7 pick the lanes of `hi`.
+    let joined: __m256 = simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]);
+    joined
+}
+
+/// Builds a `__m256d` from two `__m128d` halves, with `lo` in the low
+/// 128 bits and `hi` in the high 128 bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128d)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
+    // Reinterpret both halves as `__m128` so the single-precision helper can
+    // do the concatenation; only the bit patterns are moved.
+    let upper = mem::transmute::<__m128d, __m128>(hi);
+    let lower = mem::transmute::<__m128d, __m128>(lo);
+    let joined: __m256 = _mm256_set_m128(upper, lower);
+    mem::transmute(joined)
+}
+
+/// Builds a `__m256i` from two `__m128i` halves, with `lo` in the low
+/// 128 bits and `hi` in the high 128 bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128i)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+    // Reinterpret both halves as `__m128` so the single-precision helper can
+    // do the concatenation; only the bit patterns are moved.
+    let upper = mem::transmute::<__m128i, __m128>(hi);
+    let lower = mem::transmute::<__m128i, __m128>(lo);
+    let joined: __m256 = _mm256_set_m128(upper, lower);
+    mem::transmute(joined)
+}
+
+/// Builds a `__m256` from two `__m128` halves given in low-first order:
+/// `lo` supplies lanes 0-3 and `hi` supplies lanes 4-7 of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
+    // Same concatenation as `_mm256_set_m128`, written out directly: `lo` is
+    // the first shuffle operand (indices 0-3), `hi` the second (indices 4-7).
+    simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Builds a `__m256d` from two `__m128d` halves given in low-first order.
+/// Equivalent to `_mm256_set_m128d` with the two arguments swapped.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128d)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
+    // The non-reversed variant takes the high half first.
+    let joined: __m256d = _mm256_set_m128d(hi, lo);
+    joined
+}
+
+/// Builds a `__m256i` from two `__m128i` halves given in low-first order.
+/// Equivalent to `_mm256_set_m128i` with the two arguments swapped.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128i)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
+    // The non-reversed variant takes the high half first.
+    let joined: __m256i = _mm256_set_m128i(hi, lo);
+    joined
+}
+
+/// Loads two 128-bit values (each holding 4 packed single-precision (32-bit)
+/// floating-point elements) from memory and combines them into one 256-bit
+/// value: the data at `loaddr` becomes the low half, the data at `hiaddr`
+/// the high half.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
+    let low_half = _mm_loadu_ps(loaddr);
+    let widened = _mm256_castps128_ps256(low_half);
+    let high_half = _mm_loadu_ps(hiaddr);
+    // Index 1 targets the upper 128-bit half of `widened`.
+    _mm256_insertf128_ps(widened, high_half, 1)
+}
+
+/// Loads two 128-bit values (each holding 2 packed double-precision (64-bit)
+/// floating-point elements) from memory and combines them into one 256-bit
+/// value: the data at `loaddr` becomes the low half, the data at `hiaddr`
+/// the high half.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128d)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
+    let low_half = _mm_loadu_pd(loaddr);
+    let widened = _mm256_castpd128_pd256(low_half);
+    let high_half = _mm_loadu_pd(hiaddr);
+    // Index 1 targets the upper 128-bit half of `widened`.
+    _mm256_insertf128_pd(widened, high_half, 1)
+}
+
+/// Loads two 128-bit values (composed of integer data) from memory and
+/// combines them into one 256-bit value: the data at `loaddr` becomes the low
+/// half, the data at `hiaddr` the high half.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128i)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
+    let low_half = _mm_loadu_si128(loaddr);
+    let widened = _mm256_castsi128_si256(low_half);
+    let high_half = _mm_loadu_si128(hiaddr);
+    // Index 1 targets the upper 128-bit half of `widened`.
+    _mm256_insertf128_si256(widened, high_half, 1)
+}
+
+/// Splits `a` into its low and high 128-bit halves (each holding 4 packed
+/// single-precision (32-bit) floating-point elements) and stores them to two
+/// separate memory locations: the low half at `loaddr`, the high half at
+/// `hiaddr`.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
+    let low_half = _mm256_castps256_ps128(a);
+    let high_half = _mm256_extractf128_ps(a, 1);
+    // The low half is written first, then the high half.
+    _mm_storeu_ps(loaddr, low_half);
+    _mm_storeu_ps(hiaddr, high_half);
+}
+
+/// Splits `a` into its low and high 128-bit halves (each holding 2 packed
+/// double-precision (64-bit) floating-point elements) and stores them to two
+/// separate memory locations: the low half at `loaddr`, the high half at
+/// `hiaddr`.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128d)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
+    let low_half = _mm256_castpd256_pd128(a);
+    let high_half = _mm256_extractf128_pd(a, 1);
+    // The low half is written first, then the high half.
+    _mm_storeu_pd(loaddr, low_half);
+    _mm_storeu_pd(hiaddr, high_half);
+}
+
+/// Splits `a` into its low and high 128-bit halves (each composed of integer
+/// data) and stores them to two separate memory locations: the low half at
+/// `loaddr`, the high half at `hiaddr`.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128i)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
+    let low_half = _mm256_castsi256_si128(a);
+    let high_half = _mm256_extractf128_si256(a, 1);
+    // The low half is written first, then the high half.
+    _mm_storeu_si128(loaddr, low_half);
+    _mm_storeu_si128(hiaddr, high_half);
+}
+
+/// Extracts lane 0 (the lowest element) of a vector of `[8 x float]` and
+/// returns it as an `f32`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtss_f32)
+#[inline]
+#[target_feature(enable = "avx")]
+//#[cfg_attr(test, assert_instr(movss))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
+    let first_lane: f32 = simd_extract(a, 0);
+    first_lane
+}
+
+/// LLVM intrinsics used in the above functions
+///
+/// Each declaration binds a short local name to the LLVM intrinsic named in
+/// its `#[link_name]` attribute. The 128-bit comparison entries bind to
+/// `llvm.x86.sse*` names; every other entry binds to an `llvm.x86.avx.*` name.
+// NOTE(review): `improper_ctypes` is presumably silenced because the SIMD
+// vector types used in these signatures are not considered FFI-safe by the
+// lint — confirm.
+#[allow(improper_ctypes)]
+extern "C" {
+    // Arithmetic: add/sub, max, min, round, sqrt.
+    #[link_name = "llvm.x86.avx.addsub.pd.256"]
+    fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.addsub.ps.256"]
+    fn addsubps256(a: __m256, b: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.max.pd.256"]
+    fn maxpd256(a: __m256d, b: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.max.ps.256"]
+    fn maxps256(a: __m256, b: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.min.pd.256"]
+    fn minpd256(a: __m256d, b: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.min.ps.256"]
+    fn minps256(a: __m256, b: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.round.pd.256"]
+    fn roundpd256(a: __m256d, b: i32) -> __m256d;
+    #[link_name = "llvm.x86.avx.round.ps.256"]
+    fn roundps256(a: __m256, b: i32) -> __m256;
+    #[link_name = "llvm.x86.avx.sqrt.pd.256"]
+    fn sqrtpd256(a: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.sqrt.ps.256"]
+    fn sqrtps256(a: __m256) -> __m256;
+    // Variable blend, dot product, horizontal add/sub.
+    #[link_name = "llvm.x86.avx.blendv.pd.256"]
+    fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.blendv.ps.256"]
+    fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.dp.ps.256"]
+    fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
+    #[link_name = "llvm.x86.avx.hadd.pd.256"]
+    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.hadd.ps.256"]
+    fn vhaddps(a: __m256, b: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.hsub.pd.256"]
+    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.avx.hsub.ps.256"]
+    fn vhsubps(a: __m256, b: __m256) -> __m256;
+    // Comparisons: the 128-bit and scalar forms bind to SSE/SSE2 intrinsic
+    // names, the 256-bit forms to AVX ones.
+    #[link_name = "llvm.x86.sse2.cmp.pd"]
+    fn vcmppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
+    #[link_name = "llvm.x86.avx.cmp.pd.256"]
+    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
+    #[link_name = "llvm.x86.sse.cmp.ps"]
+    fn vcmpps(a: __m128, b: __m128, imm8: u8) -> __m128;
+    #[link_name = "llvm.x86.avx.cmp.ps.256"]
+    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
+    #[link_name = "llvm.x86.sse2.cmp.sd"]
+    fn vcmpsd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
+    #[link_name = "llvm.x86.sse.cmp.ss"]
+    fn vcmpss(a: __m128, b: __m128, imm8: u8) -> __m128;
+    // Conversions between packed integers, singles and doubles.
+    #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
+    fn vcvtdq2ps(a: i32x8) -> __m256;
+    #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
+    fn vcvtpd2ps(a: __m256d) -> __m128;
+    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
+    fn vcvtps2dq(a: __m256) -> i32x8;
+    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
+    fn vcvttpd2dq(a: __m256d) -> i32x4;
+    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
+    fn vcvtpd2dq(a: __m256d) -> i32x4;
+    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
+    fn vcvttps2dq(a: __m256) -> i32x8;
+    // Register zeroing.
+    #[link_name = "llvm.x86.avx.vzeroall"]
+    fn vzeroall();
+    #[link_name = "llvm.x86.avx.vzeroupper"]
+    fn vzeroupper();
+    // Permutes and 128-bit broadcasts.
+    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
+    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
+    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
+    fn vpermilps(a: __m128, b: i32x4) -> __m128;
+    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
+    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
+    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
+    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
+    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
+    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
+    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
+    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
+    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
+    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
+    #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
+    fn vbroadcastf128ps256(a: &__m128) -> __m256;
+    #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
+    fn vbroadcastf128pd256(a: &__m128d) -> __m256d;
+    // Unaligned stores, masked loads/stores, unaligned integer load.
+    #[link_name = "llvm.x86.avx.storeu.pd.256"]
+    fn storeupd256(mem_addr: *mut f64, a: __m256d);
+    #[link_name = "llvm.x86.avx.storeu.ps.256"]
+    fn storeups256(mem_addr: *mut f32, a: __m256);
+    #[link_name = "llvm.x86.avx.storeu.dq.256"]
+    fn storeudq256(mem_addr: *mut i8, a: i8x32);
+    #[link_name = "llvm.x86.avx.maskload.pd.256"]
+    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
+    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
+    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
+    #[link_name = "llvm.x86.avx.maskload.pd"]
+    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
+    #[link_name = "llvm.x86.avx.maskstore.pd"]
+    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
+    #[link_name = "llvm.x86.avx.maskload.ps.256"]
+    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
+    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
+    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
+    #[link_name = "llvm.x86.avx.maskload.ps"]
+    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
+    #[link_name = "llvm.x86.avx.maskstore.ps"]
+    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
+    #[link_name = "llvm.x86.avx.ldu.dq.256"]
+    fn vlddqu(mem_addr: *const i8) -> i8x32;
+    // Reciprocal and reciprocal square root.
+    #[link_name = "llvm.x86.avx.rcp.ps.256"]
+    fn vrcpps(a: __m256) -> __m256;
+    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
+    fn vrsqrtps(a: __m256) -> __m256;
+    // Test intrinsics (`ptest*` / `vtest*`), each returning an `i32`.
+    #[link_name = "llvm.x86.avx.ptestz.256"]
+    fn ptestz256(a: i64x4, b: i64x4) -> i32;
+    #[link_name = "llvm.x86.avx.ptestc.256"]
+    fn ptestc256(a: i64x4, b: i64x4) -> i32;
+    #[link_name = "llvm.x86.avx.ptestnzc.256"]
+    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
+    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
+    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
+    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
+    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestz.pd"]
+    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestc.pd"]
+    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
+    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
+    fn vtestzps256(a: __m256, b: __m256) -> i32;
+    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
+    fn vtestcps256(a: __m256, b: __m256) -> i32;
+    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
+    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
+    #[link_name = "llvm.x86.avx.vtestz.ps"]
+    fn vtestzps(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.avx.vtestc.ps"]
+    fn vtestcps(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
+    fn vtestnzcps(a: __m128, b: __m128) -> i32;
+    // Move-mask intrinsics, each returning an `i32`.
+    #[link_name = "llvm.x86.avx.movmsk.pd.256"]
+    fn movmskpd256(a: __m256d) -> i32;
+    #[link_name = "llvm.x86.avx.movmsk.ps.256"]
+    fn movmskps256(a: __m256) -> i32;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+    use test::black_box; // Used to inhibit constant-folding.
+
+    use core_arch::x86::*;
+
+    // Lane-wise addition of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_add_pd() {
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(6., 8., 10., 12.);
+        assert_eq_m256d(_mm256_add_pd(lhs, rhs), expected);
+    }
+
+    // Lane-wise addition of two `[8 x float]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_add_ps() {
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let rhs = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+        let expected = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
+        assert_eq_m256(_mm256_add_ps(lhs, rhs), expected);
+    }
+
+    // AND of the `f64` values 1.0 and 0.6 is expected to yield 0.5 in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_and_pd() {
+        let ones = _mm256_set1_pd(1.);
+        let fractions = _mm256_set1_pd(0.6);
+        let expected = _mm256_set1_pd(0.5);
+        assert_eq_m256d(_mm256_and_pd(ones, fractions), expected);
+    }
+
+    // AND of the `f32` values 1.0 and 0.6 is expected to yield 0.5 in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_and_ps() {
+        let ones = _mm256_set1_ps(1.);
+        let fractions = _mm256_set1_ps(0.6);
+        let expected = _mm256_set1_ps(0.5);
+        assert_eq_m256(_mm256_and_ps(ones, fractions), expected);
+    }
+
+    // OR of the `f64` values 1.0 and 0.6 is expected to yield 1.2 in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_or_pd() {
+        let ones = _mm256_set1_pd(1.);
+        let fractions = _mm256_set1_pd(0.6);
+        let expected = _mm256_set1_pd(1.2);
+        assert_eq_m256d(_mm256_or_pd(ones, fractions), expected);
+    }
+
+    // OR of the `f32` values 1.0 and 0.6 is expected to yield 1.2 in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_or_ps() {
+        let ones = _mm256_set1_ps(1.);
+        let fractions = _mm256_set1_ps(0.6);
+        let expected = _mm256_set1_ps(1.2);
+        assert_eq_m256(_mm256_or_ps(ones, fractions), expected);
+    }
+
+    // With control 0xF every result lane is taken from the odd element of
+    // its source pair, alternating between the first and second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_shuffle_pd() {
+        let first = _mm256_setr_pd(1., 4., 5., 8.);
+        let second = _mm256_setr_pd(2., 3., 6., 7.);
+        let expected = _mm256_setr_pd(4., 3., 8., 7.);
+        let shuffled = _mm256_shuffle_pd(first, second, 0xF);
+        assert_eq_m256d(shuffled, expected);
+    }
+
+    // Control 0x0F: per 128-bit half, the two low result lanes replicate
+    // element 3 of the first operand and the two high result lanes replicate
+    // element 0 of the second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_shuffle_ps() {
+        let first = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+        let second = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+        let expected = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
+        let shuffled = _mm256_shuffle_ps(first, second, 0x0F);
+        assert_eq_m256(shuffled, expected);
+    }
+
+    // AND-NOT with an all-zero first operand must return the second operand
+    // unchanged.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_andnot_pd() {
+        let zeros = _mm256_set1_pd(0.);
+        let payload = _mm256_set1_pd(0.6);
+        assert_eq_m256d(_mm256_andnot_pd(zeros, payload), payload);
+    }
+
+    // AND-NOT with an all-zero first operand must return the second operand
+    // unchanged.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_andnot_ps() {
+        let zeros = _mm256_set1_ps(0.);
+        let payload = _mm256_set1_ps(0.6);
+        assert_eq_m256(_mm256_andnot_ps(zeros, payload), payload);
+    }
+
+    // Lane-wise maximum of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_max_pd() {
+        let lhs = _mm256_setr_pd(1., 4., 5., 8.);
+        let rhs = _mm256_setr_pd(2., 3., 6., 7.);
+        let expected = _mm256_setr_pd(2., 4., 6., 8.);
+        assert_eq_m256d(_mm256_max_pd(lhs, rhs), expected);
+    }
+
+    // Lane-wise maximum of two `[8 x float]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_max_ps() {
+        let lhs = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+        let rhs = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+        let expected = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
+        assert_eq_m256(_mm256_max_ps(lhs, rhs), expected);
+    }
+
+    // Lane-wise minimum of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_min_pd() {
+        let lhs = _mm256_setr_pd(1., 4., 5., 8.);
+        let rhs = _mm256_setr_pd(2., 3., 6., 7.);
+        let expected = _mm256_setr_pd(1., 3., 5., 7.);
+        assert_eq_m256d(_mm256_min_pd(lhs, rhs), expected);
+    }
+
+    // Lane-wise minimum of two `[8 x float]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_min_ps() {
+        let lhs = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+        let rhs = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+        let expected = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
+        assert_eq_m256(_mm256_min_ps(lhs, rhs), expected);
+    }
+
+    // Lane-wise multiplication of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_mul_pd() {
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(5., 12., 21., 32.);
+        assert_eq_m256d(_mm256_mul_pd(lhs, rhs), expected);
+    }
+
+    // Lane-wise multiplication of two `[8 x float]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_mul_ps() {
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let rhs = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+        let expected = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
+        assert_eq_m256(_mm256_mul_ps(lhs, rhs), expected);
+    }
+
+    // Even lanes are expected to hold `a - b`, odd lanes `a + b`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_addsub_pd() {
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(-4., 8., -4., 12.);
+        assert_eq_m256d(_mm256_addsub_pd(lhs, rhs), expected);
+    }
+
+    // Even lanes are expected to hold `a - b`, odd lanes `a + b`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_addsub_ps() {
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+        let rhs = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+        let expected = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
+        assert_eq_m256(_mm256_addsub_ps(lhs, rhs), expected);
+    }
+
+    // Lane-wise subtraction of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_sub_pd() {
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(-4., -4., -4., -4.);
+        assert_eq_m256d(_mm256_sub_pd(lhs, rhs), expected);
+    }
+
+    // Lane-wise subtraction of two `[8 x float]` vectors; every lane is
+    // chosen so the difference is -4.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_sub_ps() {
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
+        let rhs = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
+        let expected = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
+        assert_eq_m256(_mm256_sub_ps(lhs, rhs), expected);
+    }
+
+    // Exercises three rounding-control values: 0 (expected to round to
+    // nearest), 1 (expected to round down) and 2 (expected to round up).
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_round_pd() {
+        let input = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+
+        let nearest = _mm256_round_pd(input, 0b00000000);
+        let down = _mm256_round_pd(input, 0b00000001);
+        let up = _mm256_round_pd(input, 0b00000010);
+
+        assert_eq_m256d(nearest, _mm256_setr_pd(2., 2., 4., -1.));
+        assert_eq_m256d(down, _mm256_setr_pd(1., 2., 3., -2.));
+        assert_eq_m256d(up, _mm256_setr_pd(2., 3., 4., -1.));
+    }
+
+    // Each lane is expected to round toward negative infinity.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_floor_pd() {
+        let input = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+        let expected = _mm256_setr_pd(1., 2., 3., -2.);
+        assert_eq_m256d(_mm256_floor_pd(input), expected);
+    }
+
+    // Each lane is expected to round toward positive infinity.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_ceil_pd() {
+        let input = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+        let expected = _mm256_setr_pd(2., 3., 4., -1.);
+        assert_eq_m256d(_mm256_ceil_pd(input), expected);
+    }
+
+    // Exercises three rounding-control values: 0 (expected to round to
+    // nearest), 1 (expected to round down) and 2 (expected to round up).
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_round_ps() {
+        let input = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+
+        let nearest = _mm256_round_ps(input, 0b00000000);
+        let down = _mm256_round_ps(input, 0b00000001);
+        let up = _mm256_round_ps(input, 0b00000010);
+
+        assert_eq_m256(nearest, _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.));
+        assert_eq_m256(down, _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.));
+        assert_eq_m256(up, _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.));
+    }
+
+    // Each lane is expected to round toward negative infinity.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_floor_ps() {
+        let input = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+        let expected = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
+        assert_eq_m256(_mm256_floor_ps(input), expected);
+    }
+
+    // Each lane is expected to round toward positive infinity.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_ceil_ps() {
+        let input = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+        let expected = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
+        assert_eq_m256(_mm256_ceil_ps(input), expected);
+    }
+
+    // Lane-wise square root of perfect squares.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_sqrt_pd() {
+        let squares = _mm256_setr_pd(4., 9., 16., 25.);
+        let roots = _mm256_setr_pd(2., 3., 4., 5.);
+        assert_eq_m256d(_mm256_sqrt_pd(squares), roots);
+    }
+
+    // Lane-wise square root of perfect squares.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_sqrt_ps() {
+        let squares = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let roots = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
+        assert_eq_m256(_mm256_sqrt_ps(squares), roots);
+    }
+
+    // Lane-wise division of two `[8 x float]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_div_ps() {
+        let dividend = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let divisor = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let expected = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
+        assert_eq_m256(_mm256_div_ps(dividend, divisor), expected);
+    }
+
+    // Lane-wise division of two `[4 x double]` vectors.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_div_pd() {
+        let dividend = _mm256_setr_pd(4., 9., 16., 25.);
+        let divisor = _mm256_setr_pd(4., 3., 2., 5.);
+        let expected = _mm256_setr_pd(1., 3., 8., 5.);
+        assert_eq_m256d(_mm256_div_pd(dividend, divisor), expected);
+    }
+
+    // A set bit `i` in the immediate is expected to take lane `i` from the
+    // second operand; a clear bit keeps the lane of the first operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_blend_pd() {
+        let first = _mm256_setr_pd(4., 9., 16., 25.);
+        let second = _mm256_setr_pd(4., 3., 2., 5.);
+
+        // No bits set: result is `first`.
+        let none = _mm256_blend_pd(first, second, 0x0);
+        assert_eq_m256d(none, _mm256_setr_pd(4., 9., 16., 25.));
+
+        // Low two bits set: lanes 0-1 come from `second`.
+        let low_pair = _mm256_blend_pd(first, second, 0x3);
+        assert_eq_m256d(low_pair, _mm256_setr_pd(4., 3., 16., 25.));
+
+        // All four bits set: result is `second`.
+        let all = _mm256_blend_pd(first, second, 0xF);
+        assert_eq_m256d(all, _mm256_setr_pd(4., 3., 2., 5.));
+    }
+
+    // A set bit `i` in the immediate is expected to take lane `i` from the
+    // second operand; a clear bit keeps the lane of the first operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_blend_ps() {
+        let first = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+        let second = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+
+        // No bits set: result is `first`.
+        let none = _mm256_blend_ps(first, second, 0x0);
+        assert_eq_m256(none, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
+
+        // Low two bits set: lanes 0-1 come from `second`.
+        let low_pair = _mm256_blend_ps(first, second, 0x3);
+        assert_eq_m256(low_pair, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
+
+        // Low four bits set: lanes 0-3 come from `second`.
+        let low_half = _mm256_blend_ps(first, second, 0xF);
+        assert_eq_m256(low_half, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
+    }
+
+    // Mask lanes of 0.0 are expected to keep the first operand; mask lanes of
+    // `!0 as f64` are expected to select the second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_blendv_pd() {
+        let first = _mm256_setr_pd(4., 9., 16., 25.);
+        let second = _mm256_setr_pd(4., 3., 2., 5.);
+        let select = !0 as f64;
+        let mask = _mm256_setr_pd(0., 0., select, select);
+        let expected = _mm256_setr_pd(4., 9., 2., 5.);
+        assert_eq_m256d(_mm256_blendv_pd(first, second, mask), expected);
+    }
+
+    // Mask lanes of 0.0 are expected to keep the first operand; mask lanes of
+    // `!0 as f32` are expected to select the second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_blendv_ps() {
+        let first = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let second = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let select = !0 as f32;
+        let mask = _mm256_setr_ps(0., 0., 0., 0., select, select, select, select);
+        let expected = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
+        assert_eq_m256(_mm256_blendv_ps(first, second, mask), expected);
+    }
+
+    // With immediate 0xFF each 128-bit half is expected to hold its own dot
+    // product broadcast to all four lanes (200 for the low half, 2387 for the
+    // high half).
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_dp_ps() {
+        let lhs = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let rhs = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let expected = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
+        let dot = _mm256_dp_ps(lhs, rhs, 0xFF);
+        assert_eq_m256(dot, expected);
+    }
+
+    // Horizontal add: result lanes alternate between the pair-sum of the
+    // first operand and the pair-sum of the second, per 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_hadd_pd() {
+        // First data set.
+        let lhs = _mm256_setr_pd(4., 9., 16., 25.);
+        let rhs = _mm256_setr_pd(4., 3., 2., 5.);
+        let expected = _mm256_setr_pd(13., 7., 41., 7.);
+        assert_eq_m256d(_mm256_hadd_pd(lhs, rhs), expected);
+
+        // Second data set.
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(3., 11., 7., 15.);
+        assert_eq_m256d(_mm256_hadd_pd(lhs, rhs), expected);
+    }
+
+    // Horizontal add: per 128-bit half, the two low result lanes hold the
+    // pair-sums of the first operand and the two high lanes those of the
+    // second.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_hadd_ps() {
+        // First data set.
+        let lhs = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let rhs = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let expected = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
+        assert_eq_m256(_mm256_hadd_ps(lhs, rhs), expected);
+
+        // Second data set.
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+        let rhs = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+        let expected = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
+        assert_eq_m256(_mm256_hadd_ps(lhs, rhs), expected);
+    }
+
+    // Horizontal subtract: result lanes alternate between the pair-difference
+    // of the first operand and that of the second, per 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_hsub_pd() {
+        // First data set.
+        let lhs = _mm256_setr_pd(4., 9., 16., 25.);
+        let rhs = _mm256_setr_pd(4., 3., 2., 5.);
+        let expected = _mm256_setr_pd(-5., 1., -9., -3.);
+        assert_eq_m256d(_mm256_hsub_pd(lhs, rhs), expected);
+
+        // Second data set.
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let expected = _mm256_setr_pd(-1., -1., -1., -1.);
+        assert_eq_m256d(_mm256_hsub_pd(lhs, rhs), expected);
+    }
+
+    // Horizontal subtract: per 128-bit half, the two low result lanes hold
+    // the pair-differences of the first operand and the two high lanes those
+    // of the second.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_hsub_ps() {
+        // First data set.
+        let lhs = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let rhs = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let expected = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
+        assert_eq_m256(_mm256_hsub_ps(lhs, rhs), expected);
+
+        // Second data set.
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+        let rhs = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+        let expected = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
+        assert_eq_m256(_mm256_hsub_ps(lhs, rhs), expected);
+    }
+
+    // XOR with an all-zero vector must return the other operand unchanged.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_xor_pd() {
+        let payload = _mm256_setr_pd(4., 9., 16., 25.);
+        let zeros = _mm256_set1_pd(0.);
+        assert_eq_m256d(_mm256_xor_pd(payload, zeros), payload);
+    }
+
+    // XOR with an all-zero vector must return the other operand unchanged.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_xor_ps() {
+        let payload = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let zeros = _mm256_set1_ps(0.);
+        assert_eq_m256(_mm256_xor_ps(payload, zeros), payload);
+    }
+
+    // Both lanes satisfy `>=`, so both result lanes are expected to read back
+    // as NaN when interpreted as `f64`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_cmp_pd() {
+        let lhs = _mm_setr_pd(4., 9.);
+        let rhs = _mm_setr_pd(4., 3.);
+        let mask = _mm_cmp_pd(lhs, rhs, _CMP_GE_OS);
+        for lane in 0..2 {
+            assert!(get_m128d(mask, lane).is_nan());
+        }
+    }
+
+    // No lane satisfies `>=`, so the whole result is expected to be zero.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cmp_pd() {
+        let lhs = _mm256_setr_pd(1., 2., 3., 4.);
+        let rhs = _mm256_setr_pd(5., 6., 7., 8.);
+        let mask = _mm256_cmp_pd(lhs, rhs, _CMP_GE_OS);
+        assert_eq_m256d(mask, _mm256_set1_pd(0.));
+    }
+
+    // Only lane 0 satisfies `>=`: it is expected to read back as NaN, while
+    // the remaining lanes are expected to be zero.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_cmp_ps() {
+        let lhs = _mm_setr_ps(4., 3., 2., 5.);
+        let rhs = _mm_setr_ps(4., 9., 16., 25.);
+        let mask = _mm_cmp_ps(lhs, rhs, _CMP_GE_OS);
+        assert!(get_m128(mask, 0).is_nan());
+        for lane in 1..4 {
+            assert_eq!(get_m128(mask, lane), 0.);
+        }
+    }
+
+    // No lane satisfies `>=`, so the whole result is expected to be zero.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cmp_ps() {
+        let lhs = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+        let rhs = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+        let mask = _mm256_cmp_ps(lhs, rhs, _CMP_GE_OS);
+        assert_eq_m256(mask, _mm256_set1_ps(0.));
+    }
+
+    // Scalar compare: lane 0 carries the comparison result (expected NaN for
+    // a true `>=`), lane 1 is expected to be copied from the first operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_cmp_sd() {
+        let lhs = _mm_setr_pd(4., 9.);
+        let rhs = _mm_setr_pd(4., 3.);
+        let out = _mm_cmp_sd(lhs, rhs, _CMP_GE_OS);
+        assert!(get_m128d(out, 0).is_nan());
+        assert_eq!(get_m128d(out, 1), 9.);
+    }
+
+    // Lane 0 holds the comparison result (expected NaN); lanes 1..=3 are
+    // expected to keep the first operand's values 3., 2. and 5.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_cmp_ss() {
+        let lhs = _mm_setr_ps(4., 3., 2., 5.);
+        let rhs = _mm_setr_ps(4., 9., 16., 25.);
+        let got = _mm_cmp_ss(lhs, rhs, _CMP_GE_OS);
+        assert!(get_m128(got, 0).is_nan());
+        let passthrough = [3., 2., 5.];
+        for (offset, want) in passthrough.iter().enumerate() {
+            assert_eq!(get_m128(got, offset + 1), *want);
+        }
+    }
+
+    // Four 32-bit integers are converted to the matching f64 lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtepi32_pd() {
+        let ints = _mm_setr_epi32(4, 9, 16, 25);
+        let want = _mm256_setr_pd(4., 9., 16., 25.);
+        assert_eq_m256d(_mm256_cvtepi32_pd(ints), want);
+    }
+
+    // Eight 32-bit integers are converted to the matching f32 lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtepi32_ps() {
+        let ints = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+        let want = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        assert_eq_m256(_mm256_cvtepi32_ps(ints), want);
+    }
+
+    // Four f64 lanes are narrowed into a 128-bit vector of f32 lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtpd_ps() {
+        let doubles = _mm256_setr_pd(4., 9., 16., 25.);
+        let want = _mm_setr_ps(4., 9., 16., 25.);
+        assert_eq_m128(_mm256_cvtpd_ps(doubles), want);
+    }
+
+    // Integral f32 inputs are expected to convert to the same i32 values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtps_epi32() {
+        let floats = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let want = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+        assert_eq_m256i(_mm256_cvtps_epi32(floats), want);
+    }
+
+    // Four f32 lanes are widened into four f64 lanes with the same values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtps_pd() {
+        let floats = _mm_setr_ps(4., 9., 16., 25.);
+        let want = _mm256_setr_pd(4., 9., 16., 25.);
+        assert_eq_m256d(_mm256_cvtps_pd(floats), want);
+    }
+
+    // Integral f64 inputs are expected to convert to the same i32 values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvttpd_epi32() {
+        let doubles = _mm256_setr_pd(4., 9., 16., 25.);
+        let want = _mm_setr_epi32(4, 9, 16, 25);
+        assert_eq_m128i(_mm256_cvttpd_epi32(doubles), want);
+    }
+
+    // Integral f64 inputs are expected to convert to the same i32 values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtpd_epi32() {
+        let doubles = _mm256_setr_pd(4., 9., 16., 25.);
+        let want = _mm_setr_epi32(4, 9, 16, 25);
+        assert_eq_m128i(_mm256_cvtpd_epi32(doubles), want);
+    }
+
+    // Integral f32 inputs are expected to convert to the same i32 values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvttps_epi32() {
+        let floats = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+        let want = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+        assert_eq_m256i(_mm256_cvttps_epi32(floats), want);
+    }
+
+    // Index 0 is expected to return the first four lanes of the source.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_extractf128_ps() {
+        let src = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let low_half = _mm256_extractf128_ps(src, 0);
+        assert_eq_m128(low_half, _mm_setr_ps(4., 3., 2., 5.));
+    }
+
+    // Index 0 is expected to return the first two lanes of the source.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_extractf128_pd() {
+        let src = _mm256_setr_pd(4., 3., 2., 5.);
+        let low_half = _mm256_extractf128_pd(src, 0);
+        assert_eq_m128d(low_half, _mm_setr_pd(4., 3.));
+    }
+
+    // Index 0 is expected to return the first two 64-bit lanes of the source.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_extractf128_si256() {
+        let src = _mm256_setr_epi64x(4, 3, 2, 5);
+        let low_half = _mm256_extractf128_si256(src, 0);
+        assert_eq_m128i(low_half, _mm_setr_epi64x(4, 3));
+    }
+
+    // Smoke test: only checks that the intrinsic can be invoked; nothing is
+    // asserted about register contents.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_zeroall() {
+        _mm256_zeroall()
+    }
+
+    // Smoke test: only checks that the intrinsic can be invoked; nothing is
+    // asserted about register contents.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_zeroupper() {
+        _mm256_zeroupper()
+    }
+
+    // Per-lane variable permute. The expected vector is consistent with each
+    // control value selecting (modulo 4) a lane inside its own 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permutevar_ps() {
+        let src = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let ctrl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let want = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
+        assert_eq_m256(_mm256_permutevar_ps(src, ctrl), want);
+    }
+
+    // Per-lane variable permute; the expected vector is consistent with each
+    // control value selecting a source lane modulo 4.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_permutevar_ps() {
+        let src = _mm_setr_ps(4., 3., 2., 5.);
+        let ctrl = _mm_setr_epi32(1, 2, 3, 4);
+        let want = _mm_setr_ps(3., 2., 5., 4.);
+        assert_eq_m128(_mm_permutevar_ps(src, ctrl), want);
+    }
+
+    // With immediate 0x1b the expected result reverses the four lanes inside
+    // each 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permute_ps() {
+        let src = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let want = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
+        assert_eq_m256(_mm256_permute_ps(src, 0x1b), want);
+    }
+
+    // With immediate 0x1b the expected result is the source in reverse order.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_permute_ps() {
+        let src = _mm_setr_ps(4., 3., 2., 5.);
+        let want = _mm_setr_ps(5., 2., 3., 4.);
+        assert_eq_m128(_mm_permute_ps(src, 0x1b), want);
+    }
+
+    // Variable permute of f64 lanes. The expected vector is consistent with
+    // bit 1 of each control element choosing the lane inside its 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permutevar_pd() {
+        let src = _mm256_setr_pd(4., 3., 2., 5.);
+        let ctrl = _mm256_setr_epi64x(1, 2, 3, 4);
+        let want = _mm256_setr_pd(4., 3., 5., 2.);
+        assert_eq_m256d(_mm256_permutevar_pd(src, ctrl), want);
+    }
+
+    // Control (3, 0) is expected to swap the two f64 lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_permutevar_pd() {
+        let src = _mm_setr_pd(4., 3.);
+        let ctrl = _mm_setr_epi64x(3, 0);
+        let want = _mm_setr_pd(3., 4.);
+        assert_eq_m128d(_mm_permutevar_pd(src, ctrl), want);
+    }
+
+    // Immediate 5 is expected to swap the two lanes inside each 128-bit half.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permute_pd() {
+        let src = _mm256_setr_pd(4., 3., 2., 5.);
+        let want = _mm256_setr_pd(3., 4., 5., 2.);
+        assert_eq_m256d(_mm256_permute_pd(src, 5), want);
+    }
+
+    // Immediate 1 is expected to swap the two f64 lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_permute_pd() {
+        let src = _mm_setr_pd(4., 3.);
+        let want = _mm_setr_pd(3., 4.);
+        assert_eq_m128d(_mm_permute_pd(src, 1), want);
+    }
+
+    // Immediate 0x13: the expected result takes its low half from `hi_src`
+    // and its high half from `lo_src`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permute2f128_ps() {
+        let lo_src = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+        let hi_src = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+        let got = _mm256_permute2f128_ps(lo_src, hi_src, 0x13);
+        assert_eq_m256(got, _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.));
+    }
+
+    // Immediate 0x31: the expected result combines the upper half of the
+    // first operand with the upper half of the second.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permute2f128_pd() {
+        let first = _mm256_setr_pd(1., 2., 3., 4.);
+        let second = _mm256_setr_pd(5., 6., 7., 8.);
+        let got = _mm256_permute2f128_pd(first, second, 0x31);
+        assert_eq_m256d(got, _mm256_setr_pd(3., 4., 7., 8.));
+    }
+
+    // Immediate 0x20: the expected result combines the lower half of the
+    // first operand with the lower half of the second.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_permute2f128_si256() {
+        let first = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
+        let second = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
+        let got = _mm256_permute2f128_si256(first, second, 0x20);
+        assert_eq_m256i(got, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
+    }
+
+    // A single f32 read through a reference must fill all eight lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_broadcast_ss() {
+        let splat = _mm256_broadcast_ss(&3.);
+        assert_eq_m256(splat, _mm256_set1_ps(3.));
+    }
+
+    // A single f32 read through a reference must fill all four lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_broadcast_ss() {
+        let splat = _mm_broadcast_ss(&3.);
+        assert_eq_m128(splat, _mm_set1_ps(3.));
+    }
+
+    // A single f64 read through a reference must fill all four lanes.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_broadcast_sd() {
+        let splat = _mm256_broadcast_sd(&3.);
+        assert_eq_m256d(splat, _mm256_set1_pd(3.));
+    }
+
+    // The 128-bit source is expected to appear in both halves of the result.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_broadcast_ps() {
+        let quad = _mm_setr_ps(4., 3., 2., 5.);
+        let want = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
+        assert_eq_m256(_mm256_broadcast_ps(&quad), want);
+    }
+
+    // The 128-bit source is expected to appear in both halves of the result.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_broadcast_pd() {
+        let pair = _mm_setr_pd(4., 3.);
+        let want = _mm256_setr_pd(4., 3., 4., 3.);
+        assert_eq_m256d(_mm256_broadcast_pd(&pair), want);
+    }
+
+    // Index 0 is expected to overwrite the first four lanes and leave the
+    // last four untouched.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insertf128_ps() {
+        let base = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let patch = _mm_setr_ps(4., 9., 16., 25.);
+        let want = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
+        assert_eq_m256(_mm256_insertf128_ps(base, patch, 0), want);
+    }
+
+    // Index 0 is expected to overwrite the first two lanes and leave the
+    // last two untouched.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insertf128_pd() {
+        let base = _mm256_setr_pd(1., 2., 3., 4.);
+        let patch = _mm_setr_pd(5., 6.);
+        let want = _mm256_setr_pd(5., 6., 3., 4.);
+        assert_eq_m256d(_mm256_insertf128_pd(base, patch, 0), want);
+    }
+
+    // Index 0 is expected to overwrite the first two 64-bit lanes and leave
+    // the last two untouched.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insertf128_si256() {
+        let base = _mm256_setr_epi64x(1, 2, 3, 4);
+        let patch = _mm_setr_epi64x(5, 6);
+        let want = _mm256_setr_epi64x(5, 6, 3, 4);
+        assert_eq_m256i(_mm256_insertf128_si256(base, patch, 0), want);
+    }
+
+    // Writes 0 into byte index 31; only the final byte of the expected
+    // vector differs from the input.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insert_epi8() {
+        #[rustfmt::skip]
+        let bytes = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 0,
+        );
+        let got = _mm256_insert_epi8(bytes, 0, 31);
+        assert_eq_m256i(got, want);
+    }
+
+    // Writes 0 into 16-bit lane 15; only the final lane of the expected
+    // vector differs from the input.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insert_epi16() {
+        #[rustfmt::skip]
+        let words = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 0,
+        );
+        let got = _mm256_insert_epi16(words, 0, 15);
+        assert_eq_m256i(got, want);
+    }
+
+    // Writes 0 into 32-bit lane 7; the other lanes must be unchanged.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insert_epi32() {
+        let dwords = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let want = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+        assert_eq_m256i(_mm256_insert_epi32(dwords, 0, 7), want);
+    }
+
+    // Loads through a pointer to an existing vector value and expects the
+    // same four lanes back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_load_pd() {
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let got = _mm256_load_pd(&src as *const _ as *const f64);
+        assert_eq_m256d(got, _mm256_setr_pd(1., 2., 3., 4.));
+    }
+
+    // Stores into a vector-typed destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_store_pd() {
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let mut dst = _mm256_undefined_pd();
+        let dst_ptr = &mut dst as *mut _ as *mut f64;
+        _mm256_store_pd(dst_ptr, src);
+        assert_eq_m256d(dst, src);
+    }
+
+    // Loads through a pointer to an existing vector value and expects the
+    // same eight lanes back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_load_ps() {
+        let src = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let got = _mm256_load_ps(&src as *const _ as *const f32);
+        assert_eq_m256(got, _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.));
+    }
+
+    // Stores into a vector-typed destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_store_ps() {
+        let src = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+        let mut dst = _mm256_undefined_ps();
+        let dst_ptr = &mut dst as *mut _ as *mut f32;
+        _mm256_store_ps(dst_ptr, src);
+        assert_eq_m256(dst, src);
+    }
+
+    // Loads four f64 values from a plain array; the pointer goes through
+    // `black_box` before being handed to the intrinsic.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu_pd() {
+        let data = &[1.0f64, 2., 3., 4.];
+        let got = _mm256_loadu_pd(black_box(data.as_ptr()));
+        assert_eq_m256d(got, _mm256_setr_pd(1., 2., 3., 4.));
+    }
+
+    // Stores a splat of 9. into a vector-typed destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu_pd() {
+        let src = _mm256_set1_pd(9.);
+        let mut dst = _mm256_undefined_pd();
+        let dst_ptr = &mut dst as *mut _ as *mut f64;
+        _mm256_storeu_pd(dst_ptr, src);
+        assert_eq_m256d(dst, src);
+    }
+
+    // Loads eight f32 values from a plain array; the pointer goes through
+    // `black_box` before being handed to the intrinsic.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu_ps() {
+        let data = &[4., 3., 2., 5., 8., 9., 64., 50.];
+        let got = _mm256_loadu_ps(black_box(data.as_ptr()));
+        assert_eq_m256(got, _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.));
+    }
+
+    // Stores a splat of 9. into a vector-typed destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu_ps() {
+        let src = _mm256_set1_ps(9.);
+        let mut dst = _mm256_undefined_ps();
+        let dst_ptr = &mut dst as *mut _ as *mut f32;
+        _mm256_storeu_ps(dst_ptr, src);
+        assert_eq_m256(dst, src);
+    }
+
+    // Loads through a pointer to an existing integer vector and expects the
+    // same lanes back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_load_si256() {
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        let src_ptr = &src as *const _;
+        let got = _mm256_load_si256(src_ptr);
+        assert_eq_m256i(got, _mm256_setr_epi64x(1, 2, 3, 4));
+    }
+
+    // Stores into an integer vector destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_store_si256() {
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        let mut dst = _mm256_undefined_si256();
+        let dst_ptr = &mut dst as *mut _;
+        _mm256_store_si256(dst_ptr, src);
+        assert_eq_m256i(dst, src);
+    }
+
+    // Loads through a `black_box`ed pointer to an existing integer vector and
+    // expects the same lanes back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu_si256() {
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        let src_ptr = &src as *const _;
+        let got = _mm256_loadu_si256(black_box(src_ptr));
+        assert_eq_m256i(got, _mm256_setr_epi64x(1, 2, 3, 4));
+    }
+
+    // Stores a byte splat of 9 into an integer vector destination and reads
+    // it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu_si256() {
+        let src = _mm256_set1_epi8(9);
+        let mut dst = _mm256_undefined_si256();
+        let dst_ptr = &mut dst as *mut _;
+        _mm256_storeu_si256(dst_ptr, src);
+        assert_eq_m256i(dst, src);
+    }
+
+    // Lanes whose mask element is `!0` (1 and 3) are expected to be loaded;
+    // lanes with a zero mask are expected to read as 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_maskload_pd() {
+        let data = &[1.0f64, 2., 3., 4.];
+        let select = _mm256_setr_epi64x(0, !0, 0, !0);
+        let got = _mm256_maskload_pd(black_box(data.as_ptr()), select);
+        assert_eq_m256d(got, _mm256_setr_pd(0., 2., 0., 4.));
+    }
+
+    // The destination starts zeroed; only lanes whose mask element is `!0`
+    // (1 and 3) are expected to receive the source values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_maskstore_pd() {
+        let mut dst = _mm256_set1_pd(0.);
+        let select = _mm256_setr_epi64x(0, !0, 0, !0);
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let dst_ptr = &mut dst as *mut _ as *mut f64;
+        _mm256_maskstore_pd(dst_ptr, select, src);
+        assert_eq_m256d(dst, _mm256_setr_pd(0., 2., 0., 4.));
+    }
+
+    // Only lane 1 has its mask set, so lane 0 is expected to read as 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_maskload_pd() {
+        let data = &[1.0f64, 2.];
+        let select = _mm_setr_epi64x(0, !0);
+        let got = _mm_maskload_pd(black_box(data.as_ptr()), select);
+        assert_eq_m128d(got, _mm_setr_pd(0., 2.));
+    }
+
+    // The destination starts zeroed; only lane 1 (mask `!0`) is expected to
+    // receive the source value.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_maskstore_pd() {
+        let mut dst = _mm_set1_pd(0.);
+        let select = _mm_setr_epi64x(0, !0);
+        let src = _mm_setr_pd(1., 2.);
+        let dst_ptr = &mut dst as *mut _ as *mut f64;
+        _mm_maskstore_pd(dst_ptr, select, src);
+        assert_eq_m128d(dst, _mm_setr_pd(0., 2.));
+    }
+
+    // Odd lanes have their mask set and are expected to be loaded; even
+    // lanes are expected to read as 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_maskload_ps() {
+        let data = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
+        let select = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+        let got = _mm256_maskload_ps(black_box(data.as_ptr()), select);
+        assert_eq_m256(got, _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.));
+    }
+
+    // The destination starts zeroed; only the odd lanes (mask `!0`) are
+    // expected to receive the source values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_maskstore_ps() {
+        let mut dst = _mm256_set1_ps(0.);
+        let select = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let dst_ptr = &mut dst as *mut _ as *mut f32;
+        _mm256_maskstore_ps(dst_ptr, select, src);
+        assert_eq_m256(dst, _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.));
+    }
+
+    // Lanes 1 and 3 have their mask set and are expected to be loaded; the
+    // others are expected to read as 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_maskload_ps() {
+        let data = &[1.0f32, 2., 3., 4.];
+        let select = _mm_setr_epi32(0, !0, 0, !0);
+        let got = _mm_maskload_ps(black_box(data.as_ptr()), select);
+        assert_eq_m128(got, _mm_setr_ps(0., 2., 0., 4.));
+    }
+
+    // The destination starts zeroed; only lanes 1 and 3 (mask `!0`) are
+    // expected to receive the source values.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_maskstore_ps() {
+        let mut dst = _mm_set1_ps(0.);
+        let select = _mm_setr_epi32(0, !0, 0, !0);
+        let src = _mm_setr_ps(1., 2., 3., 4.);
+        let dst_ptr = &mut dst as *mut _ as *mut f32;
+        _mm_maskstore_ps(dst_ptr, select, src);
+        assert_eq_m128(dst, _mm_setr_ps(0., 2., 0., 4.));
+    }
+
+    // Each odd-indexed lane is expected to be copied into the even lane
+    // just below it.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_movehdup_ps() {
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let want = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
+        assert_eq_m256(_mm256_movehdup_ps(src), want);
+    }
+
+    // Each even-indexed lane is expected to be copied into the odd lane
+    // just above it.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_moveldup_ps() {
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let want = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
+        assert_eq_m256(_mm256_moveldup_ps(src), want);
+    }
+
+    // Each even-indexed f64 lane is expected to be copied into the odd lane
+    // just above it.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_movedup_pd() {
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let want = _mm256_setr_pd(1., 1., 3., 3.);
+        assert_eq_m256d(_mm256_movedup_pd(src), want);
+    }
+
+    // Loads 32 bytes through a `black_box`ed pointer to an existing vector
+    // and expects the same byte sequence back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_lddqu_si256() {
+        #[rustfmt::skip]
+        let src = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        let src_ptr = &src as *const _;
+        let got = _mm256_lddqu_si256(black_box(src_ptr));
+        assert_eq_m256i(got, want);
+    }
+
+    // Streams into an integer vector destination and reads it back.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_stream_si256() {
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        let mut dst = _mm256_undefined_si256();
+        let dst_ptr = &mut dst as *mut _;
+        _mm256_stream_si256(dst_ptr, src);
+        assert_eq_m256i(dst, src);
+    }
+
+    // Streams a splat of 7.0 into a `#[repr(align(32))]` buffer that starts
+    // out as -1.0, then checks every element against the source lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_stream_pd() {
+        #[repr(align(32))]
+        struct AlignedBuf {
+            pub lanes: [f64; 4],
+        }
+        let mut buf = AlignedBuf { lanes: [-1.0; 4] };
+        let src = _mm256_set1_pd(7.0);
+
+        _mm256_stream_pd(&mut buf.lanes[0] as *mut f64, src);
+        for (i, stored) in buf.lanes.iter().enumerate() {
+            assert_eq!(*stored, get_m256d(src, i));
+        }
+    }
+
+    // Streams a splat of 7.0 into a `#[repr(align(32))]` buffer that starts
+    // out as -1.0, then checks every element against the source lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_stream_ps() {
+        #[repr(align(32))]
+        struct AlignedBuf {
+            pub lanes: [f32; 8],
+        }
+        let mut buf = AlignedBuf { lanes: [-1.0; 8] };
+        let src = _mm256_set1_ps(7.0);
+
+        _mm256_stream_ps(&mut buf.lanes[0] as *mut f32, src);
+        for (i, stored) in buf.lanes.iter().enumerate() {
+            assert_eq!(*stored, get_m256(src, i));
+        }
+    }
+
+    // The reciprocal is only checked approximately: each lane is compared
+    // against a reference value with a tolerance of twice `rel_tol`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_rcp_ps() {
+        let rel_tol = 0.00048828125;
+        #[rustfmt::skip]
+        let want = _mm256_setr_ps(
+            0.99975586, 0.49987793, 0.33325195, 0.24993896,
+            0.19995117, 0.16662598, 0.14282227, 0.12496948,
+        );
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let got = _mm256_rcp_ps(src);
+        for lane in 0..8 {
+            assert_approx_eq!(get_m256(got, lane), get_m256(want, lane), 2. * rel_tol);
+        }
+    }
+
+    // The reciprocal square root is only checked approximately: each lane is
+    // compared against a reference value with a tolerance of twice `rel_tol`.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_rsqrt_ps() {
+        let rel_tol = 0.00048828125;
+        #[rustfmt::skip]
+        let want = _mm256_setr_ps(
+            0.99975586, 0.7069092, 0.5772705, 0.49987793,
+            0.44714355, 0.40820313, 0.3779297, 0.3534546,
+        );
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let got = _mm256_rsqrt_ps(src);
+        for lane in 0..8 {
+            assert_approx_eq!(get_m256(got, lane), get_m256(want, lane), 2. * rel_tol);
+        }
+    }
+
+    // The expected result interleaves the odd-indexed lanes of both operands.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_unpackhi_pd() {
+        let first = _mm256_setr_pd(1., 2., 3., 4.);
+        let second = _mm256_setr_pd(5., 6., 7., 8.);
+        let want = _mm256_setr_pd(2., 6., 4., 8.);
+        assert_eq_m256d(_mm256_unpackhi_pd(first, second), want);
+    }
+
+    // The expected result interleaves the upper two lanes of each 128-bit
+    // half of both operands.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_unpackhi_ps() {
+        let first = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let second = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+        let want = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
+        assert_eq_m256(_mm256_unpackhi_ps(first, second), want);
+    }
+
+    // The expected result interleaves the even-indexed lanes of both operands.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_unpacklo_pd() {
+        let first = _mm256_setr_pd(1., 2., 3., 4.);
+        let second = _mm256_setr_pd(5., 6., 7., 8.);
+        let want = _mm256_setr_pd(1., 5., 3., 7.);
+        assert_eq_m256d(_mm256_unpacklo_pd(first, second), want);
+    }
+
+    // The expected result interleaves the lower two lanes of each 128-bit
+    // half of both operands.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_unpacklo_ps() {
+        let first = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let second = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+        let want = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
+        assert_eq_m256(_mm256_unpacklo_ps(first, second), want);
+    }
+
+    // Expects 0 when the operands share set bits and 1 when the second
+    // operand is all zeros.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testz_si256() {
+        let bits = _mm256_setr_epi64x(1, 2, 3, 4);
+        let overlapping = _mm256_setr_epi64x(5, 6, 7, 8);
+        assert_eq!(_mm256_testz_si256(bits, overlapping), 0);
+        let zero = _mm256_set1_epi64x(0);
+        assert_eq!(_mm256_testz_si256(bits, zero), 1);
+    }
+
+    // Expects 0 when the second operand has bits the first lacks, and 1 when
+    // the second operand is all zeros.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testc_si256() {
+        let bits = _mm256_setr_epi64x(1, 2, 3, 4);
+        let extra = _mm256_setr_epi64x(5, 6, 7, 8);
+        assert_eq!(_mm256_testc_si256(bits, extra), 0);
+        let zero = _mm256_set1_epi64x(0);
+        assert_eq!(_mm256_testc_si256(bits, zero), 1);
+    }
+
+    // Expects 1 for operands that partially overlap and 0 when both operands
+    // are all zeros.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testnzc_si256() {
+        let mixed_a = _mm256_setr_epi64x(1, 2, 3, 4);
+        let mixed_b = _mm256_setr_epi64x(5, 6, 7, 8);
+        assert_eq!(_mm256_testnzc_si256(mixed_a, mixed_b), 1);
+        let zero_a = _mm256_setr_epi64x(0, 0, 0, 0);
+        let zero_b = _mm256_setr_epi64x(0, 0, 0, 0);
+        assert_eq!(_mm256_testnzc_si256(zero_a, zero_b), 0);
+    }
+
+    // Expects 1 for all-positive operands and 0 once both operands are
+    // negative in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testz_pd() {
+        let pos_a = _mm256_setr_pd(1., 2., 3., 4.);
+        let pos_b = _mm256_setr_pd(5., 6., 7., 8.);
+        assert_eq!(_mm256_testz_pd(pos_a, pos_b), 1);
+        let neg = _mm256_set1_pd(-1.);
+        assert_eq!(_mm256_testz_pd(neg, neg), 0);
+    }
+
+    // Expects 1 for all-positive operands and 0 when the first operand is
+    // positive while the second is negative.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testc_pd() {
+        let pos_a = _mm256_setr_pd(1., 2., 3., 4.);
+        let pos_b = _mm256_setr_pd(5., 6., 7., 8.);
+        assert_eq!(_mm256_testc_pd(pos_a, pos_b), 1);
+        let pos = _mm256_set1_pd(1.);
+        let neg = _mm256_set1_pd(-1.);
+        assert_eq!(_mm256_testc_pd(pos, neg), 0);
+    }
+
+    // Expects 0 for all-positive operands and 1 for operands whose signs
+    // agree in some lanes and differ in others.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testnzc_pd() {
+        let pos_a = _mm256_setr_pd(1., 2., 3., 4.);
+        let pos_b = _mm256_setr_pd(5., 6., 7., 8.);
+        assert_eq!(_mm256_testnzc_pd(pos_a, pos_b), 0);
+        let mixed_a = _mm256_setr_pd(1., -1., -1., -1.);
+        let mixed_b = _mm256_setr_pd(-1., -1., 1., 1.);
+        assert_eq!(_mm256_testnzc_pd(mixed_a, mixed_b), 1);
+    }
+
+    // Expects 1 for all-positive operands and 0 once both operands are
+    // negative in every lane.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testz_pd() {
+        let pos_a = _mm_setr_pd(1., 2.);
+        let pos_b = _mm_setr_pd(5., 6.);
+        assert_eq!(_mm_testz_pd(pos_a, pos_b), 1);
+        let neg = _mm_set1_pd(-1.);
+        assert_eq!(_mm_testz_pd(neg, neg), 0);
+    }
+
+    // Expects 1 for all-positive operands and 0 when the first operand is
+    // positive while the second is negative.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testc_pd() {
+        let pos_a = _mm_setr_pd(1., 2.);
+        let pos_b = _mm_setr_pd(5., 6.);
+        assert_eq!(_mm_testc_pd(pos_a, pos_b), 1);
+        let pos = _mm_set1_pd(1.);
+        let neg = _mm_set1_pd(-1.);
+        assert_eq!(_mm_testc_pd(pos, neg), 0);
+    }
+
+    // Expects 0 for all-positive operands and 1 for operands whose signs
+    // agree in one lane and differ in the other.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testnzc_pd() {
+        let pos_a = _mm_setr_pd(1., 2.);
+        let pos_b = _mm_setr_pd(5., 6.);
+        assert_eq!(_mm_testnzc_pd(pos_a, pos_b), 0);
+        let mixed_a = _mm_setr_pd(1., -1.);
+        let mixed_b = _mm_setr_pd(-1., -1.);
+        assert_eq!(_mm_testnzc_pd(mixed_a, mixed_b), 1);
+    }
+
+    // Expects 1 when the operand is positive in every lane and 0 when it is
+    // negative in every lane (the same vector is passed twice).
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testz_ps() {
+        let pos = _mm256_set1_ps(1.);
+        assert_eq!(_mm256_testz_ps(pos, pos), 1);
+        let neg = _mm256_set1_ps(-1.);
+        assert_eq!(_mm256_testz_ps(neg, neg), 0);
+    }
+
+    // Expects 1 for a positive vector against itself and 0 for a positive
+    // first operand against a negative second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testc_ps() {
+        let pos = _mm256_set1_ps(1.);
+        assert_eq!(_mm256_testc_ps(pos, pos), 1);
+        let neg = _mm256_set1_ps(-1.);
+        assert_eq!(_mm256_testc_ps(pos, neg), 0);
+    }
+
+    // Expects 0 for a positive vector against itself and 1 for operands
+    // whose signs agree in some lanes and differ in others.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_testnzc_ps() {
+        let pos = _mm256_set1_ps(1.);
+        assert_eq!(_mm256_testnzc_ps(pos, pos), 0);
+        let mixed_a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
+        let mixed_b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
+        assert_eq!(_mm256_testnzc_ps(mixed_a, mixed_b), 1);
+    }
+
+    // Expects 1 when the operand is positive in every lane and 0 when it is
+    // negative in every lane (the same vector is passed twice).
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testz_ps() {
+        let pos = _mm_set1_ps(1.);
+        assert_eq!(_mm_testz_ps(pos, pos), 1);
+        let neg = _mm_set1_ps(-1.);
+        assert_eq!(_mm_testz_ps(neg, neg), 0);
+    }
+
+    // Expects 1 for a positive vector against itself and 0 for a positive
+    // first operand against a negative second operand.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testc_ps() {
+        let pos = _mm_set1_ps(1.);
+        assert_eq!(_mm_testc_ps(pos, pos), 1);
+        let neg = _mm_set1_ps(-1.);
+        assert_eq!(_mm_testc_ps(pos, neg), 0);
+    }
+
+    // Expects 0 for a positive vector against itself and 1 for operands
+    // whose signs agree in some lanes and differ in others.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm_testnzc_ps() {
+        let pos = _mm_set1_ps(1.);
+        assert_eq!(_mm_testnzc_ps(pos, pos), 0);
+        let mixed_a = _mm_setr_ps(1., -1., -1., -1.);
+        let mixed_b = _mm_setr_ps(-1., -1., 1., 1.);
+        assert_eq!(_mm_testnzc_ps(mixed_a, mixed_b), 1);
+    }
+
+    // Lanes 1 and 3 are negative, so the expected mask is 0b1010.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_movemask_pd() {
+        let signs = _mm256_setr_pd(1., -2., 3., -4.);
+        assert_eq!(_mm256_movemask_pd(signs), 0xA);
+    }
+
+    // Every odd lane is negative, so the expected mask is 0b1010_1010.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_movemask_ps() {
+        let signs = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
+        assert_eq!(_mm256_movemask_ps(signs), 0xAA);
+    }
+
+    // The zero vector must equal a splat of 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setzero_pd() {
+        let want = _mm256_set1_pd(0.);
+        assert_eq_m256d(_mm256_setzero_pd(), want);
+    }
+
+    // The zero vector must equal a splat of 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setzero_ps() {
+        let want = _mm256_set1_ps(0.);
+        assert_eq_m256(_mm256_setzero_ps(), want);
+    }
+
+    // The zero vector must equal a byte splat of 0.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setzero_si256() {
+        let want = _mm256_set1_epi8(0);
+        assert_eq_m256i(_mm256_setzero_si256(), want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_pd() {
+        let got = _mm256_set_pd(1., 2., 3., 4.);
+        let want = _mm256_setr_pd(4., 3., 2., 1.);
+        assert_eq_m256d(got, want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_ps() {
+        let got = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let want = _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.);
+        assert_eq_m256(got, want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_epi8() {
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi8(
+            32, 31, 30, 29, 28, 27, 26, 25,
+            24, 23, 22, 21, 20, 19, 18, 17,
+            16, 15, 14, 13, 12, 11, 10, 9,
+            8, 7, 6, 5, 4, 3, 2, 1,
+        );
+        #[rustfmt::skip]
+        let got = _mm256_set_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        assert_eq_m256i(got, want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_epi16() {
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi16(
+            16, 15, 14, 13, 12, 11, 10, 9,
+            8, 7, 6, 5, 4, 3, 2, 1,
+        );
+        #[rustfmt::skip]
+        let got = _mm256_set_epi16(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        assert_eq_m256i(got, want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_epi32() {
+        let got = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let want = _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+        assert_eq_m256i(got, want);
+    }
+
+    // `set` must produce the same vector as `setr` with reversed arguments.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_epi64x() {
+        let got = _mm256_set_epi64x(1, 2, 3, 4);
+        let want = _mm256_setr_epi64x(4, 3, 2, 1);
+        assert_eq_m256i(got, want);
+    }
+
+    // Checks every lane produced by `_mm256_setr_pd` individually: argument i
+    // must land in lane i. Comparing the result against a second, identical
+    // `_mm256_setr_pd` call could never detect a wrong lane order.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_pd() {
+        let r = _mm256_setr_pd(1., 2., 3., 4.);
+        let lanes = [1., 2., 3., 4.];
+        for i in 0..4 {
+            assert_eq!(get_m256d(r, i), lanes[i]);
+        }
+    }
+
+    // Checks every lane produced by `_mm256_setr_ps` individually: argument i
+    // must land in lane i. Comparing the result against a second, identical
+    // `_mm256_setr_ps` call could never detect a wrong lane order.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_ps() {
+        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let lanes = [1., 2., 3., 4., 5., 6., 7., 8.];
+        for i in 0..8 {
+            assert_eq!(get_m256(r, i), lanes[i]);
+        }
+    }
+
+    // NOTE(review): both sides are built by the same `_mm256_setr_epi8` call
+    // with the same arguments, so this only shows the call is repeatable.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_epi8() {
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let got = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+
+        assert_eq_m256i(got, want);
+    }
+
+    // NOTE(review): both sides are built by the same `_mm256_setr_epi16` call
+    // with the same arguments, so this only shows the call is repeatable.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_epi16() {
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi16(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let got = _mm256_setr_epi16(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        assert_eq_m256i(got, want);
+    }
+
+    // NOTE(review): both sides are built by the same `_mm256_setr_epi32` call
+    // with the same arguments, so this only shows the call is repeatable.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_epi32() {
+        let got = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let want = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        assert_eq_m256i(got, want);
+    }
+
+    // NOTE(review): both sides are built by the same `_mm256_setr_epi64x`
+    // call with the same arguments, so this only shows the call is repeatable.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_epi64x() {
+        let got = _mm256_setr_epi64x(1, 2, 3, 4);
+        let want = _mm256_setr_epi64x(1, 2, 3, 4);
+        assert_eq_m256i(got, want);
+    }
+
+    // Checks that `_mm256_set1_pd` writes its argument into each of the four
+    // lanes. Comparing against a second, identical `_mm256_set1_pd` call could
+    // never fail, so the lanes are read back individually instead.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_pd() {
+        let r = _mm256_set1_pd(1.);
+        for i in 0..4 {
+            assert_eq!(get_m256d(r, i), 1.);
+        }
+    }
+
+    // Checks that `_mm256_set1_ps` writes its argument into each of the eight
+    // lanes. Comparing against a second, identical `_mm256_set1_ps` call could
+    // never fail, so the lanes are read back individually instead.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_ps() {
+        let r = _mm256_set1_ps(1.);
+        for i in 0..8 {
+            assert_eq!(get_m256(r, i), 1.);
+        }
+    }
+
+    // Checks the byte splat against a vector with all 32 lanes spelled out;
+    // comparing two identical `_mm256_set1_epi8` calls could never fail.
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_epi8() {
+        let r = _mm256_set1_epi8(1);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_epi16() {
+        // Splatting the same scalar twice must give equal vectors.
+        let actual = _mm256_set1_epi16(1);
+        let expected = _mm256_set1_epi16(1);
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_epi32() {
+        // Splatting the same scalar twice must give equal vectors.
+        let actual = _mm256_set1_epi32(1);
+        let expected = _mm256_set1_epi32(1);
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set1_epi64x() {
+        // Splatting the same scalar twice must give equal vectors.
+        let actual = _mm256_set1_epi64x(1);
+        let expected = _mm256_set1_epi64x(1);
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castpd_ps() {
+        // The cast only reinterprets bits: each f64 is viewed as two f32
+        // lanes (low word first, then high word), hence the expected values.
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let expected = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        let actual = _mm256_castpd_ps(src);
+        assert_eq_m256(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castps_pd() {
+        // Inverse of the pd -> ps check: pairs of f32 lanes are reinterpreted
+        // as the f64 values 1.0 through 4.0.
+        let src = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        let expected = _mm256_setr_pd(1., 2., 3., 4.);
+        let actual = _mm256_castps_pd(src);
+        assert_eq_m256d(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castps_si256() {
+        // Expected bytes are the little-endian encodings of 1.0f32..=8.0f32,
+        // e.g. 1.0 == 0x3F80_0000 -> [0, 0, -128, 63].
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            0, 0, -128, 63, 0, 0, 0, 64,
+            0, 0, 64, 64, 0, 0, -128, 64,
+            0, 0, -96, 64, 0, 0, -64, 64,
+            0, 0, -32, 64, 0, 0, 0, 65,
+        );
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let actual = _mm256_castps_si256(src);
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castsi256_ps() {
+        // Source bytes are the little-endian encodings of 1.0f32..=8.0f32,
+        // so reinterpreting them must yield exactly those floats.
+        #[rustfmt::skip]
+        let src = _mm256_setr_epi8(
+            0, 0, -128, 63, 0, 0, 0, 64,
+            0, 0, 64, 64, 0, 0, -128, 64,
+            0, 0, -96, 64, 0, 0, -64, 64,
+            0, 0, -32, 64, 0, 0, 0, 65,
+        );
+        let expected = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let actual = _mm256_castsi256_ps(src);
+        assert_eq_m256(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castpd_si256() {
+        // Transmuting the integer view back must reproduce the input bits.
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let as_int = _mm256_castpd_si256(src);
+        assert_eq_m256d(mem::transmute(as_int), src);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castsi256_pd() {
+        // The cast must agree with a plain transmute of the integer vector.
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        let as_pd = _mm256_castsi256_pd(src);
+        assert_eq_m256d(as_pd, mem::transmute(src));
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castps256_ps128() {
+        // Only the first four lanes survive the narrowing cast.
+        let wide = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let expected = _mm_setr_ps(1., 2., 3., 4.);
+        assert_eq_m128(_mm256_castps256_ps128(wide), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castpd256_pd128() {
+        // Only the first two lanes survive the narrowing cast.
+        let wide = _mm256_setr_pd(1., 2., 3., 4.);
+        let expected = _mm_setr_pd(1., 2.);
+        assert_eq_m128d(_mm256_castpd256_pd128(wide), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_castsi256_si128() {
+        // Only the first two 64-bit lanes survive the narrowing cast.
+        let wide = _mm256_setr_epi64x(1, 2, 3, 4);
+        let expected = _mm_setr_epi64x(1, 2);
+        assert_eq_m128i(_mm256_castsi256_si128(wide), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_zextps128_ps256() {
+        // The upper four lanes of the widened vector must be zero.
+        let narrow = _mm_setr_ps(1., 2., 3., 4.);
+        let expected = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
+        let actual = _mm256_zextps128_ps256(narrow);
+        assert_eq_m256(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_zextsi128_si256() {
+        // The upper two 64-bit lanes of the widened vector must be zero.
+        let narrow = _mm_setr_epi64x(1, 2);
+        let expected = _mm256_setr_epi64x(1, 2, 0, 0);
+        let actual = _mm256_zextsi128_si256(narrow);
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_zextpd128_pd256() {
+        // The upper two lanes of the widened vector must be zero.
+        let narrow = _mm_setr_pd(1., 2.);
+        let expected = _mm256_setr_pd(1., 2., 0., 0.);
+        let actual = _mm256_zextpd128_pd256(narrow);
+        assert_eq_m256d(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_m128() {
+        // `set` takes the high half first; the low half fills lanes 0..4.
+        let upper = _mm_setr_ps(5., 6., 7., 8.);
+        let lower = _mm_setr_ps(1., 2., 3., 4.);
+        let expected = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq_m256(_mm256_set_m128(upper, lower), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_m128d() {
+        // `set` takes the high half first; the low half fills lanes 0..2.
+        let upper = _mm_setr_pd(3., 4.);
+        let lower = _mm_setr_pd(1., 2.);
+        let expected = _mm256_setr_pd(1., 2., 3., 4.);
+        assert_eq_m256d(_mm256_set_m128d(upper, lower), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_set_m128i() {
+        // `set` takes the high half first; the low half fills bytes 0..16.
+        #[rustfmt::skip]
+        let upper = _mm_setr_epi8(
+            17, 18, 19, 20,
+            21, 22, 23, 24,
+            25, 26, 27, 28,
+            29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let lower = _mm_setr_epi8(
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16,
+        );
+        let actual = _mm256_set_m128i(upper, lower);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_m128() {
+        // `setr` takes the low half first.
+        let lower = _mm_setr_ps(1., 2., 3., 4.);
+        let upper = _mm_setr_ps(5., 6., 7., 8.);
+        let expected = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq_m256(_mm256_setr_m128(lower, upper), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_m128d() {
+        // `setr` takes the low half first.
+        let lower = _mm_setr_pd(1., 2.);
+        let upper = _mm_setr_pd(3., 4.);
+        let expected = _mm256_setr_pd(1., 2., 3., 4.);
+        assert_eq_m256d(_mm256_setr_m128d(lower, upper), expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_setr_m128i() {
+        // `setr` takes the low half first.
+        #[rustfmt::skip]
+        let lower = _mm_setr_epi8(
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let upper = _mm_setr_epi8(
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        let actual = _mm256_setr_m128i(lower, upper);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu2_m128() {
+        // The first pointer supplies the high half, the second the low half.
+        let upper = &[5., 6., 7., 8.];
+        let lower = &[1., 2., 3., 4.];
+        let actual = _mm256_loadu2_m128(upper.as_ptr(), lower.as_ptr());
+        let expected = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq_m256(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu2_m128d() {
+        // The first pointer supplies the high half, the second the low half.
+        let upper = &[3., 4.];
+        let lower = &[1., 2.];
+        let actual = _mm256_loadu2_m128d(upper.as_ptr(), lower.as_ptr());
+        let expected = _mm256_setr_pd(1., 2., 3., 4.);
+        assert_eq_m256d(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_loadu2_m128i() {
+        // The first pointer supplies the high half, the second the low half.
+        #[rustfmt::skip]
+        let upper = _mm_setr_epi8(
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let lower = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        let actual =
+            _mm256_loadu2_m128i(&upper as *const _ as *const _, &lower as *const _ as *const _);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        assert_eq_m256i(actual, expected);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu2_m128() {
+        // Both destinations start undefined and are overwritten by the store:
+        // the first pointer receives the high half, the second the low half.
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let mut upper = _mm_undefined_ps();
+        let mut lower = _mm_undefined_ps();
+        _mm256_storeu2_m128(
+            &mut upper as *mut _ as *mut f32,
+            &mut lower as *mut _ as *mut f32,
+            src,
+        );
+        assert_eq_m128(upper, _mm_setr_ps(5., 6., 7., 8.));
+        assert_eq_m128(lower, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu2_m128d() {
+        // Both destinations start undefined and are overwritten by the store:
+        // the first pointer receives the high half, the second the low half.
+        let src = _mm256_setr_pd(1., 2., 3., 4.);
+        let mut upper = _mm_undefined_pd();
+        let mut lower = _mm_undefined_pd();
+        _mm256_storeu2_m128d(
+            &mut upper as *mut _ as *mut f64,
+            &mut lower as *mut _ as *mut f64,
+            src,
+        );
+        assert_eq_m128d(upper, _mm_setr_pd(3., 4.));
+        assert_eq_m128d(lower, _mm_setr_pd(1., 2.));
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_storeu2_m128i() {
+        // Both destinations start undefined and are overwritten by the store:
+        // the first pointer receives the high half, the second the low half.
+        #[rustfmt::skip]
+        let src = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        let mut upper = _mm_undefined_si128();
+        let mut lower = _mm_undefined_si128();
+        _mm256_storeu2_m128i(&mut upper as *mut _, &mut lower as *mut _, src);
+        #[rustfmt::skip]
+        let expected_upper = _mm_setr_epi8(
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let expected_lower = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        assert_eq_m128i(upper, expected_upper);
+        assert_eq_m128i(lower, expected_lower);
+    }
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_cvtss_f32() {
+        // The extracted scalar is the first lane of the vector.
+        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let first = _mm256_cvtss_f32(src);
+        assert_eq!(first, 1.);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
new file mode 100644
index 00000000000..bc4d0e12f54
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -0,0 +1,6230 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Returns the lane-wise absolute value of the packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
+    let lanes = a.as_i32x8();
+    mem::transmute(pabsd(lanes))
+}
+
+/// Returns the lane-wise absolute value of the packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
+    let lanes = a.as_i16x16();
+    mem::transmute(pabsw(lanes))
+}
+
+/// Returns the lane-wise absolute value of the packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
+    let lanes = a.as_i8x32();
+    mem::transmute(pabsb(lanes))
+}
+
+/// Adds the packed 64-bit integers in `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
+    mem::transmute(simd_add(lhs, rhs))
+}
+
+/// Adds the packed 32-bit integers in `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i32x8(), b.as_i32x8());
+    mem::transmute(simd_add(lhs, rhs))
+}
+
+/// Adds the packed 16-bit integers in `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i16x16(), b.as_i16x16());
+    mem::transmute(simd_add(lhs, rhs))
+}
+
+/// Adds the packed 8-bit integers in `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i8x32(), b.as_i8x32());
+    mem::transmute(simd_add(lhs, rhs))
+}
+
+/// Adds the packed 8-bit integers in `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i8x32(), b.as_i8x32());
+    mem::transmute(paddsb(lhs, rhs))
+}
+
+/// Adds the packed 16-bit integers in `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i16x16(), b.as_i16x16());
+    mem::transmute(paddsw(lhs, rhs))
+}
+
+/// Adds the packed unsigned 8-bit integers in `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    mem::transmute(paddusb(lhs, rhs))
+}
+
+/// Adds the packed unsigned 16-bit integers in `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u16x16(), b.as_u16x16());
+    mem::transmute(paddusw(lhs, rhs))
+}
+
+/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shift the result right by `n` bytes, and return the low 16 bytes.
+///
+/// The operation is applied independently to each 128-bit half: within a
+/// half, the block from `a` forms the upper 16 bytes of the temporary and the
+/// block from `b` the lower 16 bytes. Consequently `n == 0` yields `b`,
+/// `n == 16` yields `a`, and any `n >= 32` yields all zeroes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpalignr, n = 7))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
+    let n = n as u32;
+    // If palignr is shifting the pair of vectors by the size of two lanes or
+    // more, every byte has been shifted out: emit zero.
+    if n >= 32 {
+        return _mm256_set1_epi8(0);
+    }
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    let (a, b, n) = if n > 16 {
+        (_mm256_set1_epi8(0), a, n - 16)
+    } else {
+        (a, b, n)
+    };
+
+    let a = a.as_i8x32();
+    let b = b.as_i8x32();
+
+    // Shuffle indices 0..=31 select bytes of `b`, 32..=63 select bytes of `a`.
+    // Each arm takes the remaining bytes of `b`'s 16-byte block followed by
+    // the first `n` bytes of the matching block of `a`.
+    let r: i8x32 = match n {
+        0 => simd_shuffle32(
+            b,
+            a,
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            ],
+        ),
+        1 => simd_shuffle32(
+            b,
+            a,
+            [
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+                24, 25, 26, 27, 28, 29, 30, 31, 48,
+            ],
+        ),
+        2 => simd_shuffle32(
+            b,
+            a,
+            [
+                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 48, 49,
+            ],
+        ),
+        3 => simd_shuffle32(
+            b,
+            a,
+            [
+                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
+            ],
+        ),
+        4 => simd_shuffle32(
+            b,
+            a,
+            [
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
+                26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
+            ],
+        ),
+        5 => simd_shuffle32(
+            b,
+            a,
+            [
+                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
+                27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
+            ],
+        ),
+        6 => simd_shuffle32(
+            b,
+            a,
+            [
+                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
+            ],
+        ),
+        7 => simd_shuffle32(
+            b,
+            a,
+            [
+                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
+            ],
+        ),
+        8 => simd_shuffle32(
+            b,
+            a,
+            [
+                8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
+                29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
+            ],
+        ),
+        9 => simd_shuffle32(
+            b,
+            a,
+            [
+                9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
+                30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+            ],
+        ),
+        10 => simd_shuffle32(
+            b,
+            a,
+            [
+                10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
+                31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+            ],
+        ),
+        11 => simd_shuffle32(
+            b,
+            a,
+            [
+                11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
+                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+            ],
+        ),
+        12 => simd_shuffle32(
+            b,
+            a,
+            [
+                12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
+                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+            ],
+        ),
+        13 => simd_shuffle32(
+            b,
+            a,
+            [
+                13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
+                50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+            ],
+        ),
+        14 => simd_shuffle32(
+            b,
+            a,
+            [
+                14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
+                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+            ],
+        ),
+        15 => simd_shuffle32(
+            b,
+            a,
+            [
+                15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
+                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+            ],
+        ),
+        // Only `n == 16` reaches this arm: the low block has been shifted out
+        // entirely, so the result is the upper operand of the concatenation.
+        // Returning `b` here would hand back the wrong input.
+        _ => a,
+    };
+    mem::transmute(r)
+}
+
+/// Returns the bitwise AND of the 256 bits (representing integer data)
+/// held in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
+    mem::transmute(simd_and(lhs, rhs))
+}
+
+/// Inverts the 256 bits (representing integer data) in `a`, then ANDs the
+/// result with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+    // XOR against an all-ones vector performs the bitwise NOT.
+    let ones = _mm256_set1_epi8(-1);
+    let not_a = simd_xor(a.as_i64x4(), ones.as_i64x4());
+    mem::transmute(simd_and(not_a, b.as_i64x4()))
+}
+
+/// Averages the packed unsigned 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u16x16(), b.as_u16x16());
+    mem::transmute(pavgw(lhs, rhs))
+}
+
+/// Averages the packed unsigned 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    mem::transmute(pavgb(lhs, rhs))
+}
+
+/// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`.
+///
+/// Only the low four bits of `imm8` are consulted: when bit `i` is set,
+/// element `i` of the result is taken from `b`, otherwise from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    // The shuffle indices are written as literals, so the mask is decoded two
+    // bits at a time through nested `match`es; each level appends the indices
+    // for the next two elements. Indices 0..=3 name elements of `a`, 4..=7
+    // name elements of `b`.
+    macro_rules! blend2 {
+        // No trailing `;`: the macro is only ever expanded in expression
+        // position (as a `match` arm value).
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    // Bits 2..=3 of the mask choose the sources of elements 2 and 3.
+    macro_rules! blend1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => blend2!($a, $b, 2, 3),
+                0b01 => blend2!($a, $b, 6, 3),
+                0b10 => blend2!($a, $b, 2, 7),
+                _ => blend2!($a, $b, 6, 7),
+            }
+        };
+    }
+    // Bits 0..=1 of the mask choose the sources of elements 0 and 1.
+    let r: i32x4 = match imm8 & 0b11 {
+        0b00 => blend1!(0, 1),
+        0b01 => blend1!(4, 1),
+        0b10 => blend1!(0, 5),
+        _ => blend1!(4, 5),
+    };
+    mem::transmute(r)
+}
+
+/// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`.
+///
+/// All eight low bits of `imm8` are consulted: when bit `i` is set, element
+/// `i` of the result is taken from `b`, otherwise from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    // The shuffle indices are written as literals, so the mask is decoded two
+    // bits at a time through nested `match`es; each level appends the indices
+    // for the next two elements. Indices 0..=7 name elements of `a`, 8..=15
+    // name elements of `b`.
+    macro_rules! blend4 {
+        // No trailing `;`: the macro is only ever expanded in expression
+        // position (as a `match` arm value).
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    // Bits 6..=7 of the mask choose the sources of elements 6 and 7.
+    macro_rules! blend3 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
+                0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
+                0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
+                _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
+            }
+        };
+    }
+    // Bits 4..=5 of the mask choose the sources of elements 4 and 5.
+    macro_rules! blend2 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => blend3!($a, $b, $c, $d, 4, 5),
+                0b01 => blend3!($a, $b, $c, $d, 12, 5),
+                0b10 => blend3!($a, $b, $c, $d, 4, 13),
+                _ => blend3!($a, $b, $c, $d, 12, 13),
+            }
+        };
+    }
+    // Bits 2..=3 of the mask choose the sources of elements 2 and 3.
+    macro_rules! blend1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => blend2!($a, $b, 2, 3),
+                0b01 => blend2!($a, $b, 10, 3),
+                0b10 => blend2!($a, $b, 2, 11),
+                _ => blend2!($a, $b, 10, 11),
+            }
+        };
+    }
+    // Bits 0..=1 of the mask choose the sources of elements 0 and 1.
+    let r: i32x8 = match imm8 & 0b11 {
+        0b00 => blend1!(0, 1),
+        0b01 => blend1!(8, 1),
+        0b10 => blend1!(0, 9),
+        _ => blend1!(8, 9),
+    };
+    mem::transmute(r)
+}
+
+/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`.
+///
+/// The same eight mask bits are applied to both groups of eight elements:
+/// when bit `i` is set, elements `i` and `i + 8` of the result are taken
+/// from `b`, otherwise from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    // The shuffle indices are written as literals, so the mask is decoded two
+    // bits at a time through nested `match`es. Indices 0..=15 name elements
+    // of `a`, 16..=31 name elements of `b`. Every level carries two index
+    // lists: one for elements 0..=7 and one (the `*2` / second group of
+    // parameters) for elements 8..=15.
+    macro_rules! blend4 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr,
+            $i:expr,
+            $j:expr,
+            $k:expr,
+            $l:expr,
+            $m:expr,
+            $n:expr,
+            $o:expr,
+            $p:expr
+        ) => {
+            simd_shuffle16(
+                a,
+                b,
+                [
+                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
+                ],
+            )
+        };
+    }
+    // Bits 6..=7 of the mask choose the sources of elements 6, 7 and 14, 15.
+    macro_rules! blend3 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $a2:expr,
+            $b2:expr,
+            $c2:expr,
+            $d2:expr,
+            $e2:expr,
+            $f2:expr
+        ) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
+                0b01 => {
+                    blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
+                }
+                0b10 => {
+                    blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
+                }
+                _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
+            }
+        };
+    }
+    // Bits 4..=5 of the mask choose the sources of elements 4, 5 and 12, 13.
+    macro_rules! blend2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $a2:expr,
+            $b2:expr,
+            $c2:expr,
+            $d2:expr
+        ) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
+                0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
+                0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
+                _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
+            }
+        };
+    }
+    // Bits 2..=3 of the mask choose the sources of elements 2, 3 and 10, 11.
+    macro_rules! blend1 {
+        ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
+                0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
+                0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
+                _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
+            }
+        };
+    }
+    // Bits 0..=1 of the mask choose the sources of elements 0, 1 and 8, 9.
+    let r: i16x16 = match imm8 & 0b11 {
+        0b00 => blend1!(0, 1, 8, 9),
+        0b01 => blend1!(16, 1, 24, 9),
+        0b10 => blend1!(0, 17, 8, 25),
+        _ => blend1!(16, 17, 24, 25),
+    };
+    mem::transmute(r)
+}
+
+/// Blends the packed 8-bit integers of `a` and `b` under control of `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i8x32(), b.as_i8x32());
+    let selector = mask.as_i8x32();
+    mem::transmute(pblendvb(lhs, rhs, selector))
+}
+
+/// Copies the low packed 8-bit integer of `a` into every element of the
+/// 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+    // Every shuffle index is 0, so only element 0 of `a` is ever read; the
+    // zero vector merely fills the second shuffle operand.
+    let filler = _mm_setzero_si128().as_i8x16();
+    let splat: i8x16 = simd_shuffle16(a.as_i8x16(), filler, [0_u32; 16]);
+    mem::transmute(splat)
+}
+
+/// Copies the low packed 8-bit integer of `a` into every element of the
+/// 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    // Every shuffle index is 0, so only element 0 of `a` is ever read; the
+    // zero vector merely fills the second shuffle operand.
+    let filler = _mm_setzero_si128().as_i8x16();
+    let splat: i8x32 = simd_shuffle32(a.as_i8x16(), filler, [0_u32; 32]);
+    mem::transmute(splat)
+}
+
+// NB: an integer `simd_shuffle4` splat is often compiled to the
+// floating-point `vbroadcastss`, hence the instruction asserted below.
+/// Copies the lowest 32-bit integer of `a` into every element of the
+/// 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    let dwords = a.as_i32x4();
+    let filler = _mm_setzero_si128().as_i32x4();
+    // Every index is 0, so only element 0 of `dwords` is ever selected.
+    let splat: i32x4 = simd_shuffle4(dwords, filler, [0_u32; 4]);
+    mem::transmute(splat)
+}
+
+// NB: an integer `simd_shuffle8` splat is often compiled to the
+// floating-point `vbroadcastss`, hence the instruction asserted below.
+/// Copies the lowest 32-bit integer of `a` into every element of the
+/// 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    let dwords = a.as_i32x4();
+    let filler = _mm_setzero_si128().as_i32x4();
+    // Eight copies of index 0 widen the splat to 256 bits.
+    let splat: i32x8 = simd_shuffle8(dwords, filler, [0_u32; 8]);
+    mem::transmute(splat)
+}
+
+/// Copies the lowest 64-bit integer of `a` into both elements of the
+/// 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    let qwords = a.as_i64x2();
+    let filler = _mm_setzero_si128().as_i64x2();
+    // Both indices are 0, so only element 0 of `qwords` is ever selected.
+    let splat: i64x2 = simd_shuffle2(qwords, filler, [0_u32; 2]);
+    mem::transmute(splat)
+}
+
+// NB: an integer `simd_shuffle4` splat is often compiled to the
+// floating-point `vbroadcastsd`, hence the instruction asserted below.
+/// Copies the lowest 64-bit integer of `a` into every element of the
+/// 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    let qwords = a.as_i64x2();
+    let filler = _mm_setzero_si128().as_i64x2();
+    // Four copies of index 0 widen the splat to 256 bits.
+    let splat: i64x4 = simd_shuffle4(qwords, filler, [0_u32; 4]);
+    mem::transmute(splat)
+}
+
+/// Copies the lowest double-precision (64-bit) floating-point element of
+/// `a` into both elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    // The zero vector is only the mandatory second shuffle operand; both
+    // indices are 0, so it is never selected.
+    let filler = _mm_setzero_pd();
+    simd_shuffle2(a, filler, [0_u32; 2])
+}
+
+/// Copies the lowest double-precision (64-bit) floating-point element of
+/// `a` into every element of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    // The zero vector is only the mandatory second shuffle operand; all four
+    // indices are 0, so it is never selected.
+    let filler = _mm_setzero_pd();
+    simd_shuffle4(a, filler, [0_u32; 4])
+}
+
+// NB: this function is often compiled to `vinsertf128` or
+// `vbroadcastf128`, so no single instruction is asserted for it.
+/// Repeats the 128 bits of integer data in `a` in both 128-bit lanes of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    let halves = a.as_i64x2();
+    let filler = _mm_setzero_si128().as_i64x2();
+    // Indices 0 and 1 name the two 64-bit elements of `halves`; listing them
+    // twice duplicates the whole 128-bit input.
+    let doubled: i64x4 = simd_shuffle4(halves, filler, [0, 1, 0, 1]);
+    mem::transmute(doubled)
+}
+
+/// Copies the lowest single-precision (32-bit) floating-point element of
+/// `a` into every element of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    // The zero vector is only the mandatory second shuffle operand; all four
+    // indices are 0, so it is never selected.
+    let filler = _mm_setzero_ps();
+    simd_shuffle4(a, filler, [0_u32; 4])
+}
+
+/// Copies the lowest single-precision (32-bit) floating-point element of
+/// `a` into every element of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    // The zero vector is only the mandatory second shuffle operand; all
+    // eight indices are 0, so it is never selected.
+    let filler = _mm_setzero_ps();
+    simd_shuffle8(a, filler, [0_u32; 8])
+}
+
+/// Copies the lowest 16-bit integer of `a` into every element of the
+/// 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    let words = a.as_i16x8();
+    let filler = _mm_setzero_si128().as_i16x8();
+    // Every index is 0, so only element 0 of `words` is ever selected.
+    let splat: i16x8 = simd_shuffle8(words, filler, [0_u32; 8]);
+    mem::transmute(splat)
+}
+
+/// Copies the lowest 16-bit integer of `a` into every element of the
+/// 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    let words = a.as_i16x8();
+    let filler = _mm_setzero_si128().as_i16x8();
+    // Sixteen copies of index 0 widen the splat to 256 bits.
+    let splat: i16x16 = simd_shuffle16(words, filler, [0_u32; 16]);
+    mem::transmute(splat)
+}
+
+/// Tests the packed 64-bit integers of `a` and `b` for equality, element
+/// by element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i64x4();
+    let rhs = b.as_i64x4();
+    let equal: i64x4 = simd_eq(lhs, rhs);
+    mem::transmute(equal)
+}
+
+/// Tests the packed 32-bit integers of `a` and `b` for equality, element
+/// by element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let equal: i32x8 = simd_eq(lhs, rhs);
+    mem::transmute(equal)
+}
+
+/// Tests the packed 16-bit integers of `a` and `b` for equality, element
+/// by element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let equal: i16x16 = simd_eq(lhs, rhs);
+    mem::transmute(equal)
+}
+
+/// Tests the packed 8-bit integers of `a` and `b` for equality, element
+/// by element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i8x32();
+    let rhs = b.as_i8x32();
+    let equal: i8x32 = simd_eq(lhs, rhs);
+    mem::transmute(equal)
+}
+
+/// Tests whether each packed signed 64-bit integer of `a` is greater than
+/// the corresponding element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i64x4();
+    let rhs = b.as_i64x4();
+    let greater: i64x4 = simd_gt(lhs, rhs);
+    mem::transmute(greater)
+}
+
+/// Tests whether each packed signed 32-bit integer of `a` is greater than
+/// the corresponding element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let greater: i32x8 = simd_gt(lhs, rhs);
+    mem::transmute(greater)
+}
+
+/// Tests whether each packed signed 16-bit integer of `a` is greater than
+/// the corresponding element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let greater: i16x16 = simd_gt(lhs, rhs);
+    mem::transmute(greater)
+}
+
+/// Tests whether each packed signed 8-bit integer of `a` is greater than
+/// the corresponding element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i8x32();
+    let rhs = b.as_i8x32();
+    let greater: i8x32 = simd_gt(lhs, rhs);
+    mem::transmute(greater)
+}
+
+/// Sign-extends the eight packed 16-bit integers of `a` to 32-bit
+/// integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    let narrow = a.as_i16x8();
+    let wide: i32x8 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Sign-extends the lower four packed 16-bit integers of `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+    let words = a.as_i16x8();
+    // Keep elements 0..=3 only; the cast needs as many inputs as outputs.
+    let low: i16x4 = simd_shuffle4(words, words, [0, 1, 2, 3]);
+    let wide: i64x4 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Sign-extends the four packed 32-bit integers of `a` to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+    let narrow = a.as_i32x4();
+    let wide: i64x4 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Sign-extends the sixteen packed 8-bit integers of `a` to 16-bit
+/// integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+    let narrow = a.as_i8x16();
+    let wide: i16x16 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Sign-extends the lower eight packed 8-bit integers of `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+    let bytes = a.as_i8x16();
+    // Keep elements 0..=7 only; the cast needs as many inputs as outputs.
+    let low: i8x8 = simd_shuffle8(bytes, bytes, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let wide: i32x8 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Sign-extends the lower four packed 8-bit integers of `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    let bytes = a.as_i8x16();
+    // Keep elements 0..=3 only; the cast needs as many inputs as outputs.
+    let low: i8x4 = simd_shuffle4(bytes, bytes, [0, 1, 2, 3]);
+    let wide: i64x4 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the eight packed unsigned 16-bit integers of `a` to
+/// 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+    // The unsigned view is what makes the widening cast a zero-extension.
+    let narrow = a.as_u16x8();
+    let wide: i32x8 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the lower four packed unsigned 16-bit integers of `a` to
+/// 64-bit integers. The upper four elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+    let words = a.as_u16x8();
+    // Keep elements 0..=3 only; the cast needs as many inputs as outputs.
+    let low: u16x4 = simd_shuffle4(words, words, [0, 1, 2, 3]);
+    let wide: i64x4 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the four packed unsigned 32-bit integers of `a` to 64-bit
+/// integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
+    // The unsigned view is what makes the widening cast a zero-extension.
+    let narrow = a.as_u32x4();
+    let wide: i64x4 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the sixteen packed unsigned 8-bit integers of `a` to
+/// 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
+    // The unsigned view is what makes the widening cast a zero-extension.
+    let narrow = a.as_u8x16();
+    let wide: i16x16 = simd_cast(narrow);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the lower eight packed unsigned 8-bit integers of `a` to
+/// 32-bit integers. The upper eight elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+    let bytes = a.as_u8x16();
+    // Keep elements 0..=7 only; the cast needs as many inputs as outputs.
+    let low: u8x8 = simd_shuffle8(bytes, bytes, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let wide: i32x8 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Zero-extends the lower four packed unsigned 8-bit integers of `a` to
+/// 64-bit integers. The upper twelve elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+    let bytes = a.as_u8x16();
+    // Keep elements 0..=3 only; the cast needs as many inputs as outputs.
+    let low: u8x4 = simd_shuffle4(bytes, bytes, [0, 1, 2, 3]);
+    let wide: i64x4 = simd_cast(low);
+    mem::transmute(wide)
+}
+
+/// Extracts one 128-bit half (of integer data) from `a`, chosen by `imm8`.
+///
+/// Only bit 0 of `imm8` is examined: `0` yields the lower half (64-bit
+/// elements 0 and 1), any other value of that bit yields the upper half
+/// (elements 2 and 3).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vextractf128, imm8 = 1)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
+    let source = a.as_i64x4();
+    // Second shuffle operand; indices 0..=3 all refer to `source`, so its
+    // contents never reach the result.
+    let unused = _mm256_undefined_si256().as_i64x4();
+    let half: i64x2 = if imm8 & 0b01 == 0 {
+        simd_shuffle2(source, unused, [0, 1])
+    } else {
+        simd_shuffle2(source, unused, [2, 3])
+    };
+    mem::transmute(half)
+}
+
+/// Adds adjacent pairs of 16-bit integers in `a` and `b` horizontally.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    mem::transmute(phaddw(lhs, rhs))
+}
+
+/// Adds adjacent pairs of 32-bit integers in `a` and `b` horizontally.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    mem::transmute(phaddd(lhs, rhs))
+}
+
+/// Adds adjacent pairs of 16-bit integers in `a` and `b` horizontally,
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    mem::transmute(phaddsw(lhs, rhs))
+}
+
+/// Subtracts adjacent pairs of 16-bit integers in `a` and `b`
+/// horizontally.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    mem::transmute(phsubw(lhs, rhs))
+}
+
+/// Subtracts adjacent pairs of 32-bit integers in `a` and `b`
+/// horizontally.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    mem::transmute(phsubd(lhs, rhs))
+}
+
+/// Subtracts adjacent pairs of 16-bit integers in `a` and `b`
+/// horizontally, using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    mem::transmute(phsubsw(lhs, rhs))
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `slice` is passed to the underlying intrinsic as a byte pointer; per
+/// Intel's documentation the scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
+    let zero = _mm_setzero_si128().as_i32x4();
+    // All-ones mask: every element is requested from memory, so `zero` is
+    // only the pass-through operand the masked intrinsic requires.
+    let neg_one = _mm_set1_epi32(-1).as_i32x4();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdd(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdd` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi32(
+    src: __m128i,
+    slice: *const i32,
+    offsets: __m128i,
+    mask: __m128i,
+    scale: i32,
+) -> __m128i {
+    let src = src.as_i32x4();
+    let mask = mask.as_i32x4();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdd(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdd` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `slice` is passed to the underlying intrinsic as a byte pointer; per
+/// Intel's documentation the scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
+    let zero = _mm256_setzero_si256().as_i32x8();
+    // All-ones mask: every element is requested from memory, so `zero` is
+    // only the pass-through operand the masked intrinsic requires.
+    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
+    let offsets = offsets.as_i32x8();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdd(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdd` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi32(
+    src: __m256i,
+    slice: *const i32,
+    offsets: __m256i,
+    mask: __m256i,
+    scale: i32,
+) -> __m256i {
+    let src = src.as_i32x8();
+    let mask = mask.as_i32x8();
+    let offsets = offsets.as_i32x8();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdd(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdd` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `slice` is passed to the underlying intrinsic as a byte pointer; per
+/// Intel's documentation the scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
+    let zero = _mm_setzero_ps();
+    // `-1.0` has its sign bit set in every element; per Intel's
+    // documentation that highest bit is what the gather tests, so every
+    // element is requested from memory and `zero` is never selected.
+    let neg_one = _mm_set1_ps(-1.0);
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdps(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdps` as a literal.
+    constify_imm8!(scale, call)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_ps(
+    src: __m128,
+    slice: *const f32,
+    offsets: __m128i,
+    mask: __m128,
+    scale: i32,
+) -> __m128 {
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdps(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdps` as a literal.
+    constify_imm8!(scale, call)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `slice` is passed to the underlying intrinsic as a byte pointer; per
+/// Intel's documentation the scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
+    let zero = _mm256_setzero_ps();
+    // `-1.0` has its sign bit set in every element; per Intel's
+    // documentation that highest bit is what the gather tests, so every
+    // element is requested from memory and `zero` is never selected.
+    let neg_one = _mm256_set1_ps(-1.0);
+    let offsets = offsets.as_i32x8();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdps(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdps` as a literal.
+    constify_imm8!(scale, call)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_ps(
+    src: __m256,
+    slice: *const f32,
+    offsets: __m256i,
+    mask: __m256,
+    scale: i32,
+) -> __m256 {
+    let offsets = offsets.as_i32x8();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdps(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdps` as a literal.
+    constify_imm8!(scale, call)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `offsets` is read as 32-bit integers and `slice` is passed to the
+/// underlying intrinsic as a byte pointer; per Intel's documentation the
+/// scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
+    let zero = _mm_setzero_si128().as_i64x2();
+    // All-ones mask: every element is requested from memory, so `zero` is
+    // only the pass-through operand the masked intrinsic requires.
+    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdq(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdq` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi64(
+    src: __m128i,
+    slice: *const i64,
+    offsets: __m128i,
+    mask: __m128i,
+    scale: i32,
+) -> __m128i {
+    let src = src.as_i64x2();
+    let mask = mask.as_i64x2();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdq(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdq` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `offsets` is read as four 32-bit integers and `slice` is passed to the
+/// underlying intrinsic as a byte pointer; per Intel's documentation the
+/// scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
+    let zero = _mm256_setzero_si256().as_i64x4();
+    // All-ones mask: every element is requested from memory, so `zero` is
+    // only the pass-through operand the masked intrinsic requires.
+    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdq(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdq` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. Per Intel's documentation, an
+/// element is loaded from memory only when the highest bit of the
+/// corresponding `mask` element is set; otherwise the value from `src` in
+/// that position is returned instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi64(
+    src: __m256i,
+    slice: *const i64,
+    offsets: __m128i,
+    mask: __m256i,
+    scale: i32,
+) -> __m256i {
+    let src = src.as_i64x4();
+    let mask = mask.as_i64x4();
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpgatherdq(src, slice, offsets, mask, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `vpgatherdq` as a literal.
+    let r = constify_imm8!(scale, call);
+    mem::transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// `offsets` is read as 32-bit integers and `slice` is passed to the
+/// underlying intrinsic as a byte pointer; per Intel's documentation the
+/// scaled offsets are byte offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
+    let zero = _mm_setzero_pd();
+    // `-1.0` has its sign bit set in every element; per Intel's
+    // documentation that highest bit is what the gather tests, so every
+    // element is requested from memory and `zero` is never selected.
+    let neg_one = _mm_set1_pd(-1.0);
+    let offsets = offsets.as_i32x4();
+    let slice = slice as *const i8;
+    macro_rules! call {
+        ($imm8:expr) => {
+            pgatherdpd(zero, slice, offsets, neg_one, $imm8)
+        };
+    }
+    // `constify_imm8!` (defined elsewhere) invokes `call!`, presumably so
+    // that `scale` reaches `pgatherdpd` as a literal.
+    constify_imm8!(scale, call)
+}
+
+/// Conditionally gathers double-precision (64-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_pd(
+    src: __m128d,
+    slice: *const f64,
+    offsets: __m128i,
+    mask: __m128d,
+    scale: i32,
+) -> __m128d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i32x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherdpd(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Gathers double-precision (64-bit) floating-point values from memory at
+/// the byte offsets `offsets * scale` relative to `slice`, where `scale`
+/// should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i32x4();
+    // Unmasked gather: the mask has the sign bit set in every lane and the
+    // pass-through source is all zeros.
+    let every_lane = _mm256_set1_pd(-1.0);
+    let passthrough = _mm256_setzero_pd();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherdpd(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Conditionally gathers double-precision (64-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_pd(
+    src: __m256d,
+    slice: *const f64,
+    offsets: __m128i,
+    mask: __m256d,
+    scale: i32,
+) -> __m256d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i32x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherdpd(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Gathers 32-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // Unmasked gather: all mask bits set, pass-through source all zeros.
+    let every_lane = _mm_set1_epi64x(-1).as_i32x4();
+    let passthrough = _mm_setzero_si128().as_i32x4();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqd(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Conditionally gathers 32-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi32(
+    src: __m128i,
+    slice: *const i32,
+    offsets: __m128i,
+    mask: __m128i,
+    scale: i32,
+) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    let fallback = src.as_i32x4();
+    let lanes = mask.as_i32x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqd(fallback, base, indices, lanes, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Gathers 32-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // Unmasked gather: all mask bits set, pass-through source all zeros.
+    let every_lane = _mm_set1_epi64x(-1).as_i32x4();
+    let passthrough = _mm_setzero_si128().as_i32x4();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqd(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Conditionally gathers 32-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi32(
+    src: __m128i,
+    slice: *const i32,
+    offsets: __m256i,
+    mask: __m128i,
+    scale: i32,
+) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    let fallback = src.as_i32x4();
+    let lanes = mask.as_i32x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqd(fallback, base, indices, lanes, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Gathers single-precision (32-bit) floating-point values from memory at
+/// the byte offsets `offsets * scale` relative to `slice`, where `offsets`
+/// holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // Unmasked gather: the mask has the sign bit set in every lane and the
+    // pass-through source is all zeros.
+    let every_lane = _mm_set1_ps(-1.0);
+    let passthrough = _mm_setzero_ps();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqps(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Conditionally gathers single-precision (32-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `offsets` holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_ps(
+    src: __m128,
+    slice: *const f32,
+    offsets: __m128i,
+    mask: __m128,
+    scale: i32,
+) -> __m128 {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqps(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Gathers single-precision (32-bit) floating-point values from memory at
+/// the byte offsets `offsets * scale` relative to `slice`, where `offsets`
+/// holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // Unmasked gather: the mask has the sign bit set in every lane and the
+    // pass-through source is all zeros.
+    let every_lane = _mm_set1_ps(-1.0);
+    let passthrough = _mm_setzero_ps();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqps(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Conditionally gathers single-precision (32-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `offsets` holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_ps(
+    src: __m128,
+    slice: *const f32,
+    offsets: __m256i,
+    mask: __m128,
+    scale: i32,
+) -> __m128 {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqps(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Gathers 64-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // Unmasked gather: all mask bits set, pass-through source all zeros.
+    let every_lane = _mm_set1_epi64x(-1).as_i64x2();
+    let passthrough = _mm_setzero_si128().as_i64x2();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqq(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Conditionally gathers 64-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi64(
+    src: __m128i,
+    slice: *const i64,
+    offsets: __m128i,
+    mask: __m128i,
+    scale: i32,
+) -> __m128i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    let fallback = src.as_i64x2();
+    let lanes = mask.as_i64x2();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqq(fallback, base, indices, lanes, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Gathers 64-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // Unmasked gather: all mask bits set, pass-through source all zeros.
+    let every_lane = _mm256_set1_epi64x(-1).as_i64x4();
+    let passthrough = _mm256_setzero_si256().as_i64x4();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqq(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Conditionally gathers 64-bit integers from memory at the byte offsets
+/// `offsets * scale` relative to `slice`, where `offsets` holds 64-bit
+/// indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi64(
+    src: __m256i,
+    slice: *const i64,
+    offsets: __m256i,
+    mask: __m256i,
+    scale: i32,
+) -> __m256i {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    let fallback = src.as_i64x4();
+    let lanes = mask.as_i64x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqq(fallback, base, indices, lanes, $imm8)
+        };
+    }
+    let gathered = constify_imm8!(scale, gather);
+    mem::transmute(gathered)
+}
+
+/// Gathers double-precision (64-bit) floating-point values from memory at
+/// the byte offsets `offsets * scale` relative to `slice`, where `offsets`
+/// holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // Unmasked gather: the mask has the sign bit set in every lane and the
+    // pass-through source is all zeros.
+    let every_lane = _mm_set1_pd(-1.0);
+    let passthrough = _mm_setzero_pd();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqpd(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Conditionally gathers double-precision (64-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `offsets` holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_pd(
+    src: __m128d,
+    slice: *const f64,
+    offsets: __m128i,
+    mask: __m128d,
+    scale: i32,
+) -> __m128d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x2();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            pgatherqpd(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Gathers double-precision (64-bit) floating-point values from memory at
+/// the byte offsets `offsets * scale` relative to `slice`, where `offsets`
+/// holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // Unmasked gather: the mask has the sign bit set in every lane and the
+    // pass-through source is all zeros.
+    let every_lane = _mm256_set1_pd(-1.0);
+    let passthrough = _mm256_setzero_pd();
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqpd(passthrough, base, indices, every_lane, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Conditionally gathers double-precision (64-bit) floating-point values
+/// from memory at the byte offsets `offsets * scale` relative to `slice`,
+/// where `offsets` holds 64-bit indices and `scale` should be 1, 2, 4 or 8.
+///
+/// A lane is loaded from memory only when the highest bit of the
+/// corresponding element of `mask` is set; every other lane is copied from
+/// `src`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_pd(
+    src: __m256d,
+    slice: *const f64,
+    offsets: __m256i,
+    mask: __m256d,
+    scale: i32,
+) -> __m256d {
+    // The underlying intrinsic takes the base address as a byte pointer.
+    let base = slice as *const i8;
+    let indices = offsets.as_i64x4();
+    // `scale` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! gather {
+        ($imm8:expr) => {
+            vpgatherqpd(src, base, indices, mask, $imm8)
+        };
+    }
+    constify_imm8!(scale, gather)
+}
+
+/// Returns a copy of `a` in which one 128-bit half has been replaced by the
+/// integer data in `b`. The lowest bit of `imm8` selects the half: 0 replaces
+/// the low half, 1 replaces the high half.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(vinsertf128, imm8 = 1)
+)]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
+    let base = a.as_i64x4();
+    // Widen `b` so both shuffle operands have four lanes; only its two low
+    // lanes (shuffle indices 4 and 5) are ever selected.
+    let patch = _mm256_castsi128_si256(b).as_i64x4();
+    let dst: i64x4 = if imm8 & 0b01 == 0 {
+        // Low half comes from `b`, high half stays from `a`.
+        simd_shuffle4(base, patch, [4, 5, 2, 3])
+    } else {
+        // Low half stays from `a`, high half comes from `b`.
+        simd_shuffle4(base, patch, [0, 1, 4, 5])
+    };
+    mem::transmute(dst)
+}
+
+/// Multiplies the packed signed 16-bit integers of `a` and `b` into
+/// intermediate signed 32-bit products, then horizontally adds each adjacent
+/// pair of those 32-bit products.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i16x16(), b.as_i16x16());
+    let sums = pmaddwd(lhs, rhs);
+    mem::transmute(sums)
+}
+
+/// Multiplies every unsigned 8-bit integer of `a` by the signed 8-bit
+/// integer at the same position in `b`, giving intermediate signed 16-bit
+/// products, then horizontally adds each adjacent pair of those 16-bit
+/// products.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    let sums = pmaddubsw(lhs, rhs);
+    mem::transmute(sums)
+}
+
+/// Loads packed 32-bit integers from the memory at `mem_addr` under control
+/// of `mask`: an element whose mask entry does not have its highest bit set
+/// is zeroed instead of loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
+    let base = mem_addr as *const i8;
+    let loaded = maskloadd(base, mask.as_i32x4());
+    mem::transmute(loaded)
+}
+
+/// Loads packed 32-bit integers from the memory at `mem_addr` under control
+/// of `mask`: an element whose mask entry does not have its highest bit set
+/// is zeroed instead of loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
+    let base = mem_addr as *const i8;
+    let loaded = maskloadd256(base, mask.as_i32x8());
+    mem::transmute(loaded)
+}
+
+/// Loads packed 64-bit integers from the memory at `mem_addr` under control
+/// of `mask`: an element whose mask entry does not have its highest bit set
+/// is zeroed instead of loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
+    let base = mem_addr as *const i8;
+    let loaded = maskloadq(base, mask.as_i64x2());
+    mem::transmute(loaded)
+}
+
+/// Loads packed 64-bit integers from the memory at `mem_addr` under control
+/// of `mask`: an element whose mask entry does not have its highest bit set
+/// is zeroed instead of loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
+    let base = mem_addr as *const i8;
+    let loaded = maskloadq256(base, mask.as_i64x4());
+    mem::transmute(loaded)
+}
+
+/// Stores packed 32-bit integers from `a` to the memory at `mem_addr` under
+/// control of `mask`: an element whose mask entry does not have its highest
+/// bit set is not written.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
+    let dest = mem_addr as *mut i8;
+    let (lanes, values) = (mask.as_i32x4(), a.as_i32x4());
+    maskstored(dest, lanes, values)
+}
+
+/// Stores packed 32-bit integers from `a` to the memory at `mem_addr` under
+/// control of `mask`: an element whose mask entry does not have its highest
+/// bit set is not written.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
+    let dest = mem_addr as *mut i8;
+    let (lanes, values) = (mask.as_i32x8(), a.as_i32x8());
+    maskstored256(dest, lanes, values)
+}
+
+/// Stores packed 64-bit integers from `a` to the memory at `mem_addr` under
+/// control of `mask`: an element whose mask entry does not have its highest
+/// bit set is not written.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
+    let dest = mem_addr as *mut i8;
+    let (lanes, values) = (mask.as_i64x2(), a.as_i64x2());
+    maskstoreq(dest, lanes, values)
+}
+
+/// Stores packed 64-bit integers from `a` to the memory at `mem_addr` under
+/// control of `mask`: an element whose mask entry does not have its highest
+/// bit set is not written.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
+    let dest = mem_addr as *mut i8;
+    let (lanes, values) = (mask.as_i64x4(), a.as_i64x4());
+    maskstoreq256(dest, lanes, values)
+}
+
+/// Returns, for each pair of packed 16-bit integers in `a` and `b`, the
+/// larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i16x16(), b.as_i16x16());
+    mem::transmute(pmaxsw(lhs, rhs))
+}
+
+/// Returns, for each pair of packed 32-bit integers in `a` and `b`, the
+/// larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i32x8(), b.as_i32x8());
+    mem::transmute(pmaxsd(lhs, rhs))
+}
+
+/// Returns, for each pair of packed 8-bit integers in `a` and `b`, the
+/// larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i8x32(), b.as_i8x32());
+    mem::transmute(pmaxsb(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 16-bit integers in `a` and
+/// `b`, the larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u16x16(), b.as_u16x16());
+    mem::transmute(pmaxuw(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 32-bit integers in `a` and
+/// `b`, the larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u32x8(), b.as_u32x8());
+    mem::transmute(pmaxud(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 8-bit integers in `a` and `b`,
+/// the larger of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    mem::transmute(pmaxub(lhs, rhs))
+}
+
+/// Returns, for each pair of packed 16-bit integers in `a` and `b`, the
+/// smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i16x16(), b.as_i16x16());
+    mem::transmute(pminsw(lhs, rhs))
+}
+
+/// Returns, for each pair of packed 32-bit integers in `a` and `b`, the
+/// smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i32x8(), b.as_i32x8());
+    mem::transmute(pminsd(lhs, rhs))
+}
+
+/// Returns, for each pair of packed 8-bit integers in `a` and `b`, the
+/// smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_i8x32(), b.as_i8x32());
+    mem::transmute(pminsb(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 16-bit integers in `a` and
+/// `b`, the smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u16x16(), b.as_u16x16());
+    mem::transmute(pminuw(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 32-bit integers in `a` and
+/// `b`, the smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u32x8(), b.as_u32x8());
+    mem::transmute(pminud(lhs, rhs))
+}
+
+/// Returns, for each pair of packed unsigned 8-bit integers in `a` and `b`,
+/// the smaller of the two.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    mem::transmute(pminub(lhs, rhs))
+}
+
+/// Builds an integer mask out of the most significant bit of every 8-bit
+/// element of `a` and returns it.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+    let bytes = a.as_i8x32();
+    pmovmskb(bytes)
+}
+
+/// Computes sums of absolute differences (SADs) between quadruplets of
+/// unsigned 8-bit integers in `a` and in `b`, storing the 16-bit results in
+/// the returned vector.
+///
+/// Eight SADs are performed per 128-bit lane, each using one quadruplet from
+/// `b` and one of eight quadruplets from `a`. The quadruplet from `b` starts
+/// at the offset specified in `imm8`; the eight quadruplets from `a` are
+/// formed from sequential 8-bit integers starting at the offset specified in
+/// `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let (lhs, rhs) = (a.as_u8x32(), b.as_u8x32());
+    // `imm8` must reach the intrinsic as a constant, so dispatch on it.
+    macro_rules! sad {
+        ($control:expr) => {
+            mpsadbw(lhs, rhs, $control)
+        };
+    }
+    let sums = constify_imm8!(imm8, sad);
+    mem::transmute(sums)
+}
+
+/// Multiplies the low signed 32-bit integers of each packed 64-bit element
+/// in `a` and `b`.
+///
+/// Returns the 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let products = pmuldq(lhs, rhs);
+    mem::transmute(products)
+}
+
+/// Multiplies the low unsigned 32-bit integers of each packed 64-bit
+/// element in `a` and `b`.
+///
+/// Returns the unsigned 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_u32x8();
+    let rhs = b.as_u32x8();
+    let products = pmuludq(lhs, rhs);
+    mem::transmute(products)
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b` into intermediate
+/// 32-bit integers and returns the high 16 bits of each intermediate
+/// product.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let high_halves = pmulhw(lhs, rhs);
+    mem::transmute(high_halves)
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b` into
+/// intermediate 32-bit integers and returns the high 16 bits of each
+/// intermediate product.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_u16x16();
+    let rhs = b.as_u16x16();
+    let high_halves = pmulhuw(lhs, rhs);
+    mem::transmute(high_halves)
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b` into intermediate
+/// 32-bit integers and returns the low 16 bits of each intermediate
+/// product.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let low_halves = simd_mul(lhs, rhs);
+    mem::transmute(low_halves)
+}
+
+/// Multiply the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and return the low 32 bits of the
+/// intermediate integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+    // Both operands are viewed as eight 32-bit lanes, so each result lane
+    // is 32 bits wide.
+    mem::transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b` into intermediate
+/// signed 32-bit integers. Each intermediate integer is truncated to its
+/// 18 most significant bits, rounded by adding 1, and bits `[16:1]` are
+/// returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let rounded = pmulhrsw(lhs, rhs);
+    mem::transmute(rounded)
+}
+
+/// Returns the bitwise OR of the 256 bits (representing integer data) in
+/// `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+    // The OR is carried out on the operands viewed as eight 32-bit lanes.
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let combined = simd_or(lhs, rhs);
+    mem::transmute(combined)
+}
+
+/// Narrows the packed 16-bit integers of `a` and `b` to packed 8-bit
+/// integers using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let packed = packsswb(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Narrows the packed 32-bit integers of `a` and `b` to packed 16-bit
+/// integers using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let packed = packssdw(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Narrows the packed 16-bit integers of `a` and `b` to packed 8-bit
+/// integers using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i16x16();
+    let rhs = b.as_i16x16();
+    let packed = packuswb(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Narrows the packed 32-bit integers of `a` and `b` to packed 16-bit
+/// integers using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let lhs = a.as_i32x8();
+    let rhs = b.as_i32x8();
+    let packed = packusdw(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Permutes the packed 32-bit integers of `a` as directed by `b`.
+///
+/// The last 3 bits of each integer in `b` address one of the 8 integers
+/// of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let values = a.as_u32x8();
+    let indices = b.as_u32x8();
+    let permuted = permd(values, indices);
+    mem::transmute(permuted)
+}
+
+/// Permutes 64-bit integers from `a` using control mask `imm8`.
+///
+/// Each pair of bits in `imm8` selects the source element for one output
+/// element: bits `[1:0]` choose output element 0, bits `[3:2]` element 1,
+/// bits `[5:4]` element 2 and bits `[7:6]` element 3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
+    // Only the low 8 bits of the control are meaningful.
+    let imm8 = (imm8 & 0xFF) as u8;
+    // Second shuffle operand; the indices generated below are all in `0..=3`.
+    let zero = _mm256_setzero_si256().as_i64x4();
+    let a = a.as_i64x4();
+    // The nested macros below expand into a `match` tree with one
+    // `simd_shuffle4` call per possible control byte, so that every call
+    // site receives literal indices.
+    macro_rules! permute4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            // No trailing semicolon: this macro is used in expression position.
+            simd_shuffle4(a, zero, [$a, $b, $c, $d])
+        };
+    }
+    // Bits [7:6] pick the fourth index.
+    macro_rules! permute3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => permute4!($a, $b, $c, 0),
+                0b01 => permute4!($a, $b, $c, 1),
+                0b10 => permute4!($a, $b, $c, 2),
+                _ => permute4!($a, $b, $c, 3),
+            }
+        };
+    }
+    // Bits [5:4] pick the third index.
+    macro_rules! permute2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => permute3!($a, $b, 0),
+                0b01 => permute3!($a, $b, 1),
+                0b10 => permute3!($a, $b, 2),
+                _ => permute3!($a, $b, 3),
+            }
+        };
+    }
+    // Bits [3:2] pick the second index.
+    macro_rules! permute1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => permute2!($a, 0),
+                0b01 => permute2!($a, 1),
+                0b10 => permute2!($a, 2),
+                _ => permute2!($a, 3),
+            }
+        };
+    }
+    // Bits [1:0] pick the first index.
+    let r: i64x4 = match imm8 & 0b11 {
+        0b00 => permute1!(0),
+        0b01 => permute1!(1),
+        0b10 => permute1!(2),
+        _ => permute1!(3),
+    };
+    mem::transmute(r)
+}
+
+/// Shuffles 128 bits of integer data from `a` and `b`, as selected by
+/// `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let first = a.as_i64x4();
+    let second = b.as_i64x4();
+    // `constify_imm8!` is given `imm8` together with the name of this macro,
+    // which performs the actual `vperm2i128` call.
+    macro_rules! permute {
+        ($imm8:expr) => {
+            vperm2i128(first, second, $imm8)
+        };
+    }
+    let r = constify_imm8!(imm8, permute);
+    mem::transmute(r)
+}
+
+/// Shuffles the 64-bit floating-point elements of `a` across lanes using
+/// the control in `imm8`.
+///
+/// Each pair of bits in `imm8` selects the source element for one output
+/// element, starting with bits `[1:0]` for output element 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
+    // Only the low 8 bits of the control are used.
+    let ctrl = (imm8 & 0xFF) as u8;
+    // Second shuffle operand; the indices generated below are all in `0..=3`.
+    let filler = _mm256_undefined_pd();
+    // Innermost step: all four indices are known literals.
+    macro_rules! emit {
+        ($i0:expr, $i1:expr, $i2:expr, $i3:expr) => {
+            simd_shuffle4(a, filler, [$i0, $i1, $i2, $i3])
+        };
+    }
+    // Bits [7:6] decide the fourth index.
+    macro_rules! pick_fourth {
+        ($i0:expr, $i1:expr, $i2:expr) => {
+            match (ctrl >> 6) & 0b11 {
+                0b00 => emit!($i0, $i1, $i2, 0),
+                0b01 => emit!($i0, $i1, $i2, 1),
+                0b10 => emit!($i0, $i1, $i2, 2),
+                _ => emit!($i0, $i1, $i2, 3),
+            }
+        };
+    }
+    // Bits [5:4] decide the third index.
+    macro_rules! pick_third {
+        ($i0:expr, $i1:expr) => {
+            match (ctrl >> 4) & 0b11 {
+                0b00 => pick_fourth!($i0, $i1, 0),
+                0b01 => pick_fourth!($i0, $i1, 1),
+                0b10 => pick_fourth!($i0, $i1, 2),
+                _ => pick_fourth!($i0, $i1, 3),
+            }
+        };
+    }
+    // Bits [3:2] decide the second index.
+    macro_rules! pick_second {
+        ($i0:expr) => {
+            match (ctrl >> 2) & 0b11 {
+                0b00 => pick_third!($i0, 0),
+                0b01 => pick_third!($i0, 1),
+                0b10 => pick_third!($i0, 2),
+                _ => pick_third!($i0, 3),
+            }
+        };
+    }
+    // Bits [1:0] decide the first index.
+    match ctrl & 0b11 {
+        0b00 => pick_second!(0),
+        0b01 => pick_second!(1),
+        0b10 => pick_second!(2),
+        _ => pick_second!(3),
+    }
+}
+
+/// Shuffles the eight 32-bit floating-point elements of `a` across lanes
+/// using the corresponding 32-bit integer index in `idx`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
+    // The indices are passed to `permps` as eight 32-bit integers.
+    let indices = idx.as_i32x8();
+    permps(a, indices)
+}
+
+/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sum each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
+/// integers in the low 16 bits of each 64-bit element of the return value
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsadbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Shuffles the bytes of `a` as directed by the contents of `b`.
+///
+/// The last 4 bits of each byte of `b` are used as addresses into the bytes
+/// of `a`. The low and high halves of the vectors are shuffled separately,
+/// so a byte of `b` only ever addresses the half of `a` it sits in.
+///
+/// In addition, when the most significant bit of a byte of `b` is set, the
+/// corresponding destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
+/// equivalent to:
+///
+/// ```
+/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
+///     let mut r = [0; 32];
+///     for i in 0..16 {
+///         // if the most significant bit of b is set,
+///         // then the destination byte is set to 0.
+///         if b[i] & 0x80 == 0u8 {
+///             r[i] = a[(b[i] % 16) as usize];
+///         }
+///         if b[i + 16] & 0x80 == 0u8 {
+///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
+///         }
+///     }
+///     r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let bytes = a.as_u8x32();
+    let control = b.as_u8x32();
+    let shuffled = pshufb(bytes, control);
+    mem::transmute(shuffled)
+}
+
+/// Shuffles the 32-bit integers within each 128-bit lane of `a` using the
+/// control in `imm8`.
+///
+/// Each pair of bits in `imm8` selects, within a lane, the source element
+/// for one output element; the same selection is applied to both lanes.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+///
+/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
+/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
+///
+/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
+/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
+///
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
+    // The selector handed to `simd_shuffleX` has to be built from constant
+    // values, which cannot be enforced on `imm8` here. The workaround is a
+    // `match` over every possible byte value, each arm hard-coding the
+    // matching `simd_shuffleX` call with constants only; LLVM is then
+    // expected to do the rest. The nested macros below generate that `match`.
+    let ctrl = (imm8 & 0xFF) as u8;
+
+    let lanes = a.as_i32x8();
+    // Innermost step: all four indices are known; the upper lane reuses them
+    // offset by 4.
+    macro_rules! emit {
+        ($i0:expr, $i1:expr, $i2:expr, $i3:expr) => {
+            simd_shuffle8(
+                lanes,
+                lanes,
+                [
+                    $i0,
+                    $i1,
+                    $i2,
+                    $i3,
+                    4 + $i0,
+                    4 + $i1,
+                    4 + $i2,
+                    4 + $i3,
+                ],
+            )
+        };
+    }
+    // Bits [7:6] decide the fourth index.
+    macro_rules! pick_fourth {
+        ($i0:expr, $i1:expr, $i2:expr) => {
+            match (ctrl >> 6) & 0b11 {
+                0b00 => emit!($i0, $i1, $i2, 0),
+                0b01 => emit!($i0, $i1, $i2, 1),
+                0b10 => emit!($i0, $i1, $i2, 2),
+                _ => emit!($i0, $i1, $i2, 3),
+            }
+        };
+    }
+    // Bits [5:4] decide the third index.
+    macro_rules! pick_third {
+        ($i0:expr, $i1:expr) => {
+            match (ctrl >> 4) & 0b11 {
+                0b00 => pick_fourth!($i0, $i1, 0),
+                0b01 => pick_fourth!($i0, $i1, 1),
+                0b10 => pick_fourth!($i0, $i1, 2),
+                _ => pick_fourth!($i0, $i1, 3),
+            }
+        };
+    }
+    // Bits [3:2] decide the second index.
+    macro_rules! pick_second {
+        ($i0:expr) => {
+            match (ctrl >> 2) & 0b11 {
+                0b00 => pick_third!($i0, 0),
+                0b01 => pick_third!($i0, 1),
+                0b10 => pick_third!($i0, 2),
+                _ => pick_third!($i0, 3),
+            }
+        };
+    }
+    // Bits [1:0] decide the first index.
+    let shuffled: i32x8 = match ctrl & 0b11 {
+        0b00 => pick_second!(0),
+        0b01 => pick_second!(1),
+        0b10 => pick_second!(2),
+        _ => pick_second!(3),
+    };
+    mem::transmute(shuffled)
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// Each pair of bits in `imm8` selects one of the four high 16-bit elements
+/// of a lane; the same selection is applied to both lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
+    // Only the low 8 bits of the control are used.
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i16x16();
+    // Innermost step: indices 0..=3 and 8..=11 (the low halves of each lane)
+    // pass through unchanged; the selected offsets are applied on top of
+    // element 4 and element 12 respectively.
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            // No trailing semicolon: this macro is used in expression position.
+            #[rustfmt::skip]
+            simd_shuffle16(a, a, [
+                0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
+                8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
+            ])
+        };
+    }
+    // Bits [7:6] decide the fourth selected offset.
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        };
+    }
+    // Bits [5:4] decide the third selected offset.
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        };
+    }
+    // Bits [3:2] decide the second selected offset.
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    // Bits [1:0] decide the first selected offset.
+    let r: i16x16 = match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    };
+    mem::transmute(r)
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// Each pair of bits in `imm8` selects one of the four low 16-bit elements
+/// of a lane; the same selection is applied to both lanes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
+    // Only the low 8 bits of the control are used.
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i16x16();
+    // Innermost step: indices 4..=7 and 12..=15 (the high halves of each
+    // lane) pass through unchanged; the selected offsets are applied on top
+    // of element 0 and element 8 respectively.
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            // No trailing semicolon: this macro is used in expression position.
+            #[rustfmt::skip]
+            simd_shuffle16(a, a, [
+                0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
+                8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
+            ])
+        };
+    }
+    // Bits [7:6] decide the fourth selected offset.
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        };
+    }
+    // Bits [5:4] decide the third selected offset.
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        };
+    }
+    // Bits [3:2] decide the second selected offset.
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    // Bits [1:0] decide the first selected offset.
+    let r: i16x16 = match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    };
+    mem::transmute(r)
+}
+
+/// Negates each packed 16-bit integer in `a` whose corresponding signed
+/// 16-bit integer in `b` is negative, and returns the results.
+/// An element of the result is zeroed when the corresponding element in
+/// `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let values = a.as_i16x16();
+    let signs = b.as_i16x16();
+    let signed = psignw(values, signs);
+    mem::transmute(signed)
+}
+
+/// Negates each packed 32-bit integer in `a` whose corresponding signed
+/// 32-bit integer in `b` is negative, and returns the results.
+/// An element of the result is zeroed when the corresponding element in
+/// `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let values = a.as_i32x8();
+    let signs = b.as_i32x8();
+    let signed = psignd(values, signs);
+    mem::transmute(signed)
+}
+
+/// Negates each packed 8-bit integer in `a` whose corresponding signed
+/// 8-bit integer in `b` is negative, and returns the results.
+/// An element of the result is zeroed when the corresponding element in
+/// `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let values = a.as_i8x32();
+    let signs = b.as_i8x32();
+    let signed = psignb(values, signs);
+    mem::transmute(signed)
+}
+
+/// Shifts the packed 16-bit integers in `a` left by `count`, shifting in
+/// zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
+    let values = a.as_i16x16();
+    let amount = count.as_i16x8();
+    let shifted = psllw(values, amount);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 32-bit integers in `a` left by `count`, shifting in
+/// zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
+    let values = a.as_i32x8();
+    let amount = count.as_i32x4();
+    let shifted = pslld(values, amount);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 64-bit integers in `a` left by `count`, shifting in
+/// zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
+    let values = a.as_i64x4();
+    let amount = count.as_i64x2();
+    let shifted = psllq(values, amount);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 16-bit integers in `a` left by `imm8`, shifting in
+/// zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
+    let values = a.as_i16x16();
+    let shifted = pslliw(values, imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 32-bit integers in `a` left by `imm8`, shifting in
+/// zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
+    let values = a.as_i32x8();
+    let shifted = psllid(values, imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 64-bit integers in `a` left by `imm8`, shifting in
+/// zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
+    let values = a.as_i64x4();
+    let shifted = pslliq(values, imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts each 128-bit lane of `a` left by `imm8` bytes, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
+    let lanes = a.as_i64x4();
+    macro_rules! shift {
+        ($imm8:expr) => {
+            vpslldq(lanes, $imm8)
+        };
+    }
+    // The byte count is multiplied by 8 before it reaches `vpslldq`.
+    // NOTE(review): `constify_imm8!` is defined elsewhere; confirm how it
+    // treats values of `imm8 * 8` that do not fit in 8 bits.
+    let shifted = constify_imm8!(imm8 * 8, shift);
+    mem::transmute(shifted)
+}
+
+/// Shifts each 128-bit lane of `a` left by `imm8` bytes, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
+    let lanes = a.as_i64x4();
+    macro_rules! shift {
+        ($imm8:expr) => {
+            vpslldq(lanes, $imm8)
+        };
+    }
+    // The byte count is multiplied by 8 before it reaches `vpslldq`.
+    // NOTE(review): `constify_imm8!` is defined elsewhere; confirm how it
+    // treats values of `imm8 * 8` that do not fit in 8 bits.
+    let shifted = constify_imm8!(imm8 * 8, shift);
+    mem::transmute(shifted)
+}
+
+/// Shifts each packed 32-bit integer in `a` left by the amount given in
+/// the corresponding element of `count`, shifting in zeros, and returns
+/// the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
+    let values = a.as_i32x4();
+    let amounts = count.as_i32x4();
+    let shifted = psllvd(values, amounts);
+    mem::transmute(shifted)
+}
+
+/// Shifts each packed 32-bit integer in `a` left by the amount given in
+/// the corresponding element of `count`, shifting in zeros, and returns
+/// the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
+    let values = a.as_i32x8();
+    let amounts = count.as_i32x8();
+    let shifted = psllvd256(values, amounts);
+    mem::transmute(shifted)
+}
+
+/// Shifts each packed 64-bit integer in `a` left by the amount given in
+/// the corresponding element of `count`, shifting in zeros, and returns
+/// the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
+    let values = a.as_i64x2();
+    let amounts = count.as_i64x2();
+    let shifted = psllvq(values, amounts);
+    mem::transmute(shifted)
+}
+
+/// Shifts each packed 64-bit integer in `a` left by the amount given in
+/// the corresponding element of `count`, shifting in zeros, and returns
+/// the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
+    let values = a.as_i64x4();
+    let amounts = count.as_i64x4();
+    let shifted = psllvq256(values, amounts);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 16-bit integers in `a` right by `count`, shifting in
+/// sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
+    let values = a.as_i16x16();
+    let amount = count.as_i16x8();
+    let shifted = psraw(values, amount);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 32-bit integers in `a` right by `count`, shifting in
+/// sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
+    let values = a.as_i32x8();
+    let amount = count.as_i32x4();
+    let shifted = psrad(values, amount);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 16-bit integers in `a` right by `imm8`, shifting in
+/// sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
+    let values = a.as_i16x16();
+    let shifted = psraiw(values, imm8);
+    mem::transmute(shifted)
+}
+
+/// Arithmetically shifts the packed 32-bit integers in `a` right by the
+/// scalar amount `imm8`, shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
+    mem::transmute(psraid(a.as_i32x8(), imm8)) // `imm8` is forwarded unchanged
+}
+
+/// Arithmetically shifts each packed 32-bit integer in the 128-bit `a` right
+/// by the amount in the corresponding element of `count`, shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
+    mem::transmute(psravd(a.as_i32x4(), count.as_i32x4())) // both operands viewed as i32x4
+}
+
+/// Arithmetically shifts each packed 32-bit integer in the 256-bit `a` right
+/// by the amount in the corresponding element of `count`, shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
+    mem::transmute(psravd256(a.as_i32x8(), count.as_i32x8())) // both operands viewed as i32x8
+}
+
+/// Shifts each 128-bit lane of `a` right by `imm8` bytes, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
+#[rustc_args_required_const(1)] // index 1 is `imm8`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
+    let a = a.as_i64x4();
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpsrldq(a, $imm8) // receives the already-scaled count (`imm8 * 8`)
+        };
+    }
+    mem::transmute(constify_imm8!(imm8 * 8, call)) // bytes scaled by 8, presumably to bits; NOTE(review): confirm `constify_imm8!` copes with values > 255
+}
+
+/// Shifts each 128-bit lane of `a` right by `imm8` bytes, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
+#[rustc_args_required_const(1)] // index 1 is `imm8`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
+    let a = a.as_i64x4();
+    macro_rules! call {
+        ($imm8:expr) => {
+            vpsrldq(a, $imm8) // receives the already-scaled count (`imm8 * 8`)
+        };
+    }
+    mem::transmute(constify_imm8!(imm8 * 8, call)) // bytes scaled by 8, presumably to bits; NOTE(review): confirm `constify_imm8!` copes with values > 255
+}
+
+/// Logically shifts the packed 16-bit integers in `a` right by the amount
+/// held in the 128-bit vector `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
+    mem::transmute(psrlw(a.as_i16x16(), count.as_i16x8())) // `a` as i16x16, `count` as i16x8
+}
+
+/// Logically shifts the packed 32-bit integers in `a` right by the amount
+/// held in the 128-bit vector `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
+    mem::transmute(psrld(a.as_i32x8(), count.as_i32x4())) // `a` as i32x8, `count` as i32x4
+}
+
+/// Logically shifts the packed 64-bit integers in `a` right by the amount
+/// held in the 128-bit vector `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
+    mem::transmute(psrlq(a.as_i64x4(), count.as_i64x2())) // `a` as i64x4, `count` as i64x2
+}
+
+/// Logically shifts the packed 16-bit integers in `a` right by the scalar
+/// amount `imm8`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
+    mem::transmute(psrliw(a.as_i16x16(), imm8)) // `imm8` is forwarded unchanged
+}
+
+/// Logically shifts the packed 32-bit integers in `a` right by the scalar
+/// amount `imm8`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
+    mem::transmute(psrlid(a.as_i32x8(), imm8)) // `imm8` is forwarded unchanged
+}
+
+/// Logically shifts the packed 64-bit integers in `a` right by the scalar
+/// amount `imm8`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
+    mem::transmute(psrliq(a.as_i64x4(), imm8)) // `imm8` is forwarded unchanged
+}
+
+/// Logically shifts each packed 32-bit integer in the 128-bit `a` right by
+/// the amount in the corresponding element of `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
+    mem::transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) // both operands viewed as i32x4
+}
+
+/// Logically shifts each packed 32-bit integer in the 256-bit `a` right by
+/// the amount in the corresponding element of `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
+    mem::transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) // both operands viewed as i32x8
+}
+
+/// Logically shifts each packed 64-bit integer in the 128-bit `a` right by
+/// the amount in the corresponding element of `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
+    mem::transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) // both operands viewed as i64x2
+}
+
+/// Logically shifts each packed 64-bit integer in the 256-bit `a` right by
+/// the amount in the corresponding element of `count`, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
+    mem::transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) // both operands viewed as i64x4
+}
+
+// TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) // element-wise `a - b` on i16x16
+}
+
+/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) // element-wise `a - b` on i32x8
+}
+
+/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) // element-wise `a - b` on i64x4
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) // element-wise `a - b` on i8x32
+}
+
+/// Subtracts packed signed 16-bit integers in `b` from packed signed 16-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(psubsw(a.as_i16x16(), b.as_i16x16())) // both operands viewed as i16x16
+}
+
+/// Subtracts packed signed 8-bit integers in `b` from packed signed 8-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(psubsb(a.as_i8x32(), b.as_i8x32())) // both operands viewed as i8x32
+}
+
+/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
+/// 16-bit integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(psubusw(a.as_u16x16(), b.as_u16x16())) // both operands viewed as u16x16
+}
+
+/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
+/// 8-bit integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(psubusb(a.as_u8x32(), b.as_u8x32())) // both operands viewed as u8x32
+}
+
+/// Unpacks and interleaves the 8-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
+///
+/// let c = _mm256_unpackhi_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
+///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
+///     -31,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
+    #[rustfmt::skip]
+    let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
+            8, 40, 9, 41, 10, 42, 11, 43, // indices 0..=31 address `a`, 32..=63 address `b`
+            12, 44, 13, 45, 14, 46, 15, 47,
+            24, 56, 25, 57, 26, 58, 27, 59, // second 128-bit lane
+            28, 60, 29, 61, 30, 62, 31, 63,
+    ]);
+    mem::transmute(r) // reinterpret the i8x32 result as `__m256i`
+}
+
+/// Unpacks and interleaves the 8-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
+///
+/// let c = _mm256_unpacklo_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
+///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
+    #[rustfmt::skip]
+    let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
+        0, 32, 1, 33, 2, 34, 3, 35, // indices 0..=31 address `a`, 32..=63 address `b`
+        4, 36, 5, 37, 6, 38, 7, 39,
+        16, 48, 17, 49, 18, 50, 19, 51, // second 128-bit lane
+        20, 52, 21, 53, 22, 54, 23, 55,
+    ]);
+    mem::transmute(r) // reinterpret the i8x32 result as `__m256i`
+}
+
+/// Unpacks and interleaves the 16-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpackhi_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let r: i16x16 = simd_shuffle16(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], // 0..=15 address `a`, 16..=31 address `b`
+    );
+    mem::transmute(r) // reinterpret the i16x16 result as `__m256i`
+}
+
+/// Unpacks and interleaves the 16-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+///
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpacklo_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
+    let r: i16x16 = simd_shuffle16(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], // 0..=15 address `a`, 16..=31 address `b`
+    );
+    mem::transmute(r) // reinterpret the i16x16 result as `__m256i`
+}
+
+/// Unpacks and interleaves the 32-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpackhi_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhps))] // the test expects `vunpckhps` for this integer shuffle
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
+    mem::transmute(r) // shuffle indices 0..=7 address `a`, 8..=15 address `b`
+}
+
+/// Unpacks and interleaves the 32-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`, alternating one element of `a`, then `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpacklo_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklps))] // the test expects `vunpcklps` for this integer shuffle
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
+    mem::transmute(r) // shuffle indices 0..=7 address `a`, 8..=15 address `b`
+}
+
+/// Unpacks and interleaves the 64-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`, placing the element of `a` before that of `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpackhi_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhpd))] // the test expects `vunpckhpd` for this integer shuffle
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
+    let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); // 0..=3 address `a`, 4..=7 address `b`
+    mem::transmute(r) // reinterpret the i64x4 result as `__m256i`
+}
+
+/// Unpacks and interleaves the 64-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`, placing the element of `a` before that of `b`.
+///
+/// ```rust
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("avx2") {
+/// #         #[target_feature(enable = "avx2")]
+/// #         unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpacklo_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklpd))] // the test expects `vunpcklpd` for this integer shuffle
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
+    let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); // 0..=3 address `a`, 4..=7 address `b`
+    mem::transmute(r) // reinterpret the i64x4 result as `__m256i`
+}
+
+/// Computes the bitwise XOR of the 256 bits (representing integer data)
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vxorps))] // the test expects `vxorps` for this integer XOR
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) // XOR performed on the i64x4 view
+}
+
+/// Extracts the 8-bit integer at index `imm8 & 31` from `a` and returns it
+/// as an `i8` (the value is not widened to a 32-bit integer).
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(1)] // index 1 is `imm8`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 {
+    let imm8 = (imm8 & 31) as u32; // keep only the low 5 bits: 32 elements in an i8x32
+    simd_extract(a.as_i8x32(), imm8)
+}
+
+/// Extracts the 16-bit integer at index `imm8 & 15` from `a` and returns it
+/// as an `i16` (the value is not widened to a 32-bit integer).
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(1)] // index 1 is `imm8`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 {
+    let imm8 = (imm8 & 15) as u32; // keep only the low 4 bits: 16 elements in an i16x16
+    simd_extract(a.as_i16x16(), imm8)
+}
+
+/// Extracts the 32-bit integer at index `imm8 & 7` from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_args_required_const(1)] // index 1 is `imm8`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
+    let imm8 = (imm8 & 7) as u32; // keep only the low 3 bits: 8 elements in an i32x8
+    simd_extract(a.as_i32x8(), imm8)
+}
+
+/// Returns the first element (index 0) of the input vector of `[4 x double]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "avx2")]
+// FIXME: instruction check is disabled: #[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
+    simd_extract(a, 0) // element 0 of `a`
+}
+
+/// Returns the first element (index 0) of the input vector of `[8 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
+#[inline]
+#[target_feature(enable = "avx2")]
+// FIXME: instruction check is disabled: #[cfg_attr(test, assert_instr(movd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
+    simd_extract(a.as_i32x8(), 0) // element 0 of the i32x8 view of `a`
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx2.pabs.b"]
+    fn pabsb(a: i8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.pabs.w"]
+    fn pabsw(a: i16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pabs.d"]
+    fn pabsd(a: i32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.padds.b"]
+    fn paddsb(a: i8x32, b: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.padds.w"]
+    fn paddsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.paddus.b"]
+    fn paddusb(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.paddus.w"]
+    fn paddusw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pavg.b"]
+    fn pavgb(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.pavg.w"]
+    fn pavgw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pblendvb"]
+    fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.phadd.w"]
+    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.phadd.d"]
+    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.phadd.sw"]
+    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.phsub.w"]
+    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.phsub.d"]
+    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.phsub.sw"]
+    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmadd.wd"]
+    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
+    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
+    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
+    #[link_name = "llvm.x86.avx2.maskload.d"]
+    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.maskload.d.256"]
+    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.maskload.q"]
+    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.avx2.maskload.q.256"]
+    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.avx2.maskstore.d"]
+    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
+    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
+    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
+    #[link_name = "llvm.x86.avx2.maskstore.q"]
+    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
+    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
+    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
+    #[link_name = "llvm.x86.avx2.pmaxs.w"]
+    fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmaxs.d"]
+    fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.pmaxs.b"]
+    fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.pmaxu.w"]
+    fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pmaxu.d"]
+    fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.pmaxu.b"]
+    fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.pmins.w"]
+    fn pminsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmins.d"]
+    fn pminsd(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.pmins.b"]
+    fn pminsb(a: i8x32, b: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.pminu.w"]
+    fn pminuw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pminu.d"]
+    fn pminud(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.pminu.b"]
+    fn pminub(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.pmovmskb"]
+    fn pmovmskb(a: i8x32) -> i32;
+    #[link_name = "llvm.x86.avx2.mpsadbw"]
+    fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pmulhu.w"]
+    fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pmulh.w"]
+    fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmul.dq"]
+    fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
+    #[link_name = "llvm.x86.avx2.pmulu.dq"]
+    fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
+    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
+    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.packsswb"]
+    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
+    #[link_name = "llvm.x86.avx2.packssdw"]
+    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
+    #[link_name = "llvm.x86.avx2.packuswb"]
+    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
+    #[link_name = "llvm.x86.avx2.packusdw"]
+    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
+    #[link_name = "llvm.x86.avx2.psad.bw"]
+    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
+    #[link_name = "llvm.x86.avx2.psign.b"]
+    fn psignb(a: i8x32, b: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.psign.w"]
+    fn psignw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psign.d"]
+    fn psignd(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psll.w"]
+    fn psllw(a: i16x16, count: i16x8) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psll.d"]
+    fn pslld(a: i32x8, count: i32x4) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psll.q"]
+    fn psllq(a: i64x4, count: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.avx2.pslli.w"]
+    fn pslliw(a: i16x16, imm8: i32) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pslli.d"]
+    fn psllid(a: i32x8, imm8: i32) -> i32x8;
+    #[link_name = "llvm.x86.avx2.pslli.q"]
+    fn pslliq(a: i64x4, imm8: i32) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psllv.d"]
+    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.psllv.d.256"]
+    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psllv.q"]
+    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.avx2.psllv.q.256"]
+    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psra.w"]
+    fn psraw(a: i16x16, count: i16x8) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psra.d"]
+    fn psrad(a: i32x8, count: i32x4) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrai.w"]
+    fn psraiw(a: i16x16, imm8: i32) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psrai.d"]
+    fn psraid(a: i32x8, imm8: i32) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrav.d"]
+    fn psravd(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.psrav.d.256"]
+    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrl.w"]
+    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psrl.d"]
+    fn psrld(a: i32x8, count: i32x4) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrl.q"]
+    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psrli.w"]
+    fn psrliw(a: i16x16, imm8: i32) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psrli.d"]
+    fn psrlid(a: i32x8, imm8: i32) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrli.q"]
+    fn psrliq(a: i64x4, imm8: i32) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psrlv.d"]
+    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
+    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.psrlv.q"]
+    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
+    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psubs.b"]
+    fn psubsb(a: i8x32, b: i8x32) -> i8x32;
+    #[link_name = "llvm.x86.avx2.psubs.w"]
+    fn psubsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.psubus.b"]
+    fn psubusb(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.psubus.w"]
+    fn psubusw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pshuf.b"]
+    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
+    #[link_name = "llvm.x86.avx2.permd"]
+    fn permd(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.permps"]
+    fn permps(a: __m256, b: i32x8) -> __m256;
+    #[link_name = "llvm.x86.avx2.vperm2i128"]
+    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
+    #[link_name = "llvm.x86.avx2.gather.d.d"]
+    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
+    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
+    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
+    #[link_name = "llvm.x86.avx2.gather.d.q"]
+    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
+    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
+    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
+    #[link_name = "llvm.x86.avx2.gather.q.d"]
+    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
+    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
+    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
+    #[link_name = "llvm.x86.avx2.gather.q.q"]
+    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
+    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
+    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
+    #[link_name = "llvm.x86.avx2.gather.d.pd"]
+    fn pgatherdpd(
+        src: __m128d,
+        slice: *const i8,
+        offsets: i32x4,
+        mask: __m128d,
+        scale: i8,
+    ) -> __m128d;
+    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
+    fn vpgatherdpd(
+        src: __m256d,
+        slice: *const i8,
+        offsets: i32x4,
+        mask: __m256d,
+        scale: i8,
+    ) -> __m256d;
+    #[link_name = "llvm.x86.avx2.gather.q.pd"]
+    fn pgatherqpd(
+        src: __m128d,
+        slice: *const i8,
+        offsets: i64x2,
+        mask: __m128d,
+        scale: i8,
+    ) -> __m128d;
+    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
+    fn vpgatherqpd(
+        src: __m256d,
+        slice: *const i8,
+        offsets: i64x4,
+        mask: __m256d,
+        scale: i8,
+    ) -> __m256d;
+    #[link_name = "llvm.x86.avx2.gather.d.ps"]
+    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
+        -> __m128;
+    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
+    fn vpgatherdps(
+        src: __m256,
+        slice: *const i8,
+        offsets: i32x8,
+        mask: __m256,
+        scale: i8,
+    ) -> __m256;
+    #[link_name = "llvm.x86.avx2.gather.q.ps"]
+    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
+        -> __m128;
+    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
+    fn vpgatherqps(
+        src: __m128,
+        slice: *const i8,
+        offsets: i64x4,
+        mask: __m128,
+        scale: i8,
+    ) -> __m128;
+    #[link_name = "llvm.x86.avx2.psll.dq"]
+    fn vpslldq(a: i64x4, b: i32) -> i64x4;
+    #[link_name = "llvm.x86.avx2.psrl.dq"]
+    fn vpsrldq(a: i64x4, b: i32) -> i64x4;
+}
+
+#[cfg(test)]
+mod tests {
+    use std;
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi32() {
+        // Covers zero, small values of both signs and the two `i32` extremes.
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi32(
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+        );
+        // The `i32::MIN` lane is expected back as `MAX.wrapping_add(1)`,
+        // which is `MIN` itself.
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi32(
+            0, 1, 1, std::i32::MAX,
+            std::i32::MIN, 100, 100, 32,
+        );
+        assert_eq_m256i(_mm256_abs_epi32(input), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi16() {
+        // Sixteen lanes mixing small values of both signs with the `i16` extremes.
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi16(
+            0, 1, -1, 2, -2, 3, -3, 4,
+            -4, 5, -5, std::i16::MAX, std::i16::MIN, 100, -100, -32,
+        );
+        // The `i16::MIN` lane is expected back as `MAX.wrapping_add(1)`,
+        // which is `MIN` itself.
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            0, 1, 1, 2, 2, 3, 3, 4,
+            4, 5, 5, std::i16::MAX, std::i16::MIN, 100, 100, 32,
+        );
+        assert_eq_m256i(_mm256_abs_epi16(input), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi8() {
+        // The same 16-byte pattern is repeated in both halves of the vector.
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi8(
+            0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
+            0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
+        );
+        // The `i8::MIN` lanes are expected back as `MAX.wrapping_add(1)`,
+        // which is `MIN` itself.
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, std::i8::MIN, 100, 100, 32,
+            0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, std::i8::MIN, 100, 100, 32,
+        );
+        assert_eq_m256i(_mm256_abs_epi8(input), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_add_epi64() {
+        // Lane-wise sums, including a negative lane and a zero lane.
+        let lhs = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
+        let rhs = _mm256_setr_epi64x(-1, 0, 1, 2);
+        let expected = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
+        assert_eq_m256i(_mm256_add_epi64(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_add_epi32() {
+        // Eight independent 32-bit sums; the first lane cancels to zero.
+        let lhs = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
+        let rhs = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let expected = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+        assert_eq_m256i(_mm256_add_epi32(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_add_epi16() {
+        // Both operands hold 0..=15, so each expected sum is twice the lane index.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        let rhs = lhs;
+        #[rustfmt::skip]
+        let doubled = _mm256_setr_epi16(
+            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+        );
+        assert_eq_m256i(_mm256_add_epi16(lhs, rhs), doubled);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_add_epi8() {
+        // Both operands hold 0..=31, so each expected sum is twice the lane index.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let rhs = lhs;
+        #[rustfmt::skip]
+        let doubled = _mm256_setr_epi8(
+            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+        );
+        assert_eq_m256i(_mm256_add_epi8(lhs, rhs), doubled);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi8() {
+        // Inputs are small enough that no lane reaches the `i8` limits, so the
+        // expected values are the plain sums.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let rhs = _mm256_setr_epi8(
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+        );
+        assert_eq_m256i(_mm256_adds_epi8(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi8_saturate_positive() {
+        // Every lane starts at `i8::MAX`; adding 1 is expected to leave it there.
+        let at_max = _mm256_set1_epi8(std::i8::MAX);
+        let one = _mm256_set1_epi8(1);
+        assert_eq_m256i(_mm256_adds_epi8(at_max, one), at_max);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi8_saturate_negative() {
+        // Every lane starts at `i8::MIN`; adding -1 is expected to leave it there.
+        let at_min = _mm256_set1_epi8(std::i8::MIN);
+        let minus_one = _mm256_set1_epi8(-1);
+        assert_eq_m256i(_mm256_adds_epi8(at_min, minus_one), at_min);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi16() {
+        // No lane comes near the `i16` limits, so the expected values are the
+        // plain sums.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let rhs = _mm256_setr_epi16(
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+        );
+        assert_eq_m256i(_mm256_adds_epi16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi16_saturate_positive() {
+        // Every lane starts at `i16::MAX`; adding 1 is expected to leave it there.
+        let at_max = _mm256_set1_epi16(std::i16::MAX);
+        let one = _mm256_set1_epi16(1);
+        assert_eq_m256i(_mm256_adds_epi16(at_max, one), at_max);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epi16_saturate_negative() {
+        // Every lane starts at `i16::MIN`; adding -1 is expected to leave it there.
+        let at_min = _mm256_set1_epi16(std::i16::MIN);
+        let minus_one = _mm256_set1_epi16(-1);
+        assert_eq_m256i(_mm256_adds_epi16(at_min, minus_one), at_min);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epu8() {
+        // All sums stay below 95, so the expected values are the plain sums.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let rhs = _mm256_setr_epi8(
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+        );
+        assert_eq_m256i(_mm256_adds_epu8(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epu8_saturate() {
+        // -1 sets every bit of each byte (0xFF); adding 1 is expected to leave
+        // the vector unchanged.
+        let all_ones = _mm256_set1_epi8(-1);
+        let one = _mm256_set1_epi8(1);
+        assert_eq_m256i(_mm256_adds_epu8(all_ones, one), all_ones);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epu16() {
+        // All sums stay small, so the expected values are the plain sums.
+        #[rustfmt::skip]
+        let lhs = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let rhs = _mm256_setr_epi16(
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+        );
+        assert_eq_m256i(_mm256_adds_epu16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_adds_epu16_saturate() {
+        // -1 sets every bit of each 16-bit lane (0xFFFF); adding 1 is expected
+        // to leave the vector unchanged.
+        let all_ones = _mm256_set1_epi16(-1);
+        let one = _mm256_set1_epi16(1);
+        assert_eq_m256i(_mm256_adds_epu16(all_ones, one), all_ones);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_and_si256() {
+        // 0b101 & 0b011 == 0b001 in every byte.
+        let fives = _mm256_set1_epi8(5);
+        let threes = _mm256_set1_epi8(3);
+        let expected = _mm256_set1_epi8(1);
+        assert_eq_m256i(_mm256_and_si256(fives, threes), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_andnot_si256() {
+        // The expected 2 in every byte matches !0b101 & 0b011 == 0b010, i.e. the
+        // first operand is the one being inverted.
+        let fives = _mm256_set1_epi8(5);
+        let threes = _mm256_set1_epi8(3);
+        let expected = _mm256_set1_epi8(2);
+        assert_eq_m256i(_mm256_andnot_si256(fives, threes), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_avg_epu8() {
+        // Averaging 3 and 9 is expected to give 6 in every byte.
+        let threes = _mm256_set1_epi8(3);
+        let nines = _mm256_set1_epi8(9);
+        let averaged = _mm256_avg_epu8(threes, nines);
+        assert_eq_m256i(averaged, _mm256_set1_epi8(6));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_avg_epu16() {
+        // Averaging 3 and 9 is expected to give 6 in every 16-bit lane.
+        let threes = _mm256_set1_epi16(3);
+        let nines = _mm256_set1_epi16(9);
+        let averaged = _mm256_avg_epu16(threes, nines);
+        assert_eq_m256i(averaged, _mm256_set1_epi16(6));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_blend_epi32() {
+        let threes = _mm_set1_epi32(3);
+        let nines = _mm_set1_epi32(9);
+        let expected = _mm_setr_epi32(9, 3, 3, 3);
+
+        // Only mask bit 0 set: lane 0 is expected from the second operand.
+        let blended = _mm_blend_epi32(threes, nines, 0b0001);
+        assert_eq_m128i(blended, expected);
+
+        // Swapped operands with the complementary 4-bit mask give the same result.
+        let blended = _mm_blend_epi32(nines, threes, 0b1110);
+        assert_eq_m128i(blended, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_blend_epi32() {
+        // In each case below, lane `i` is expected to hold 9 (second operand)
+        // exactly when bit `i` of the mask is set.
+        let threes = _mm256_set1_epi32(3);
+        let nines = _mm256_set1_epi32(9);
+
+        let expected = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
+        assert_eq_m256i(_mm256_blend_epi32(threes, nines, 0b0000_0001), expected);
+
+        let expected = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
+        assert_eq_m256i(_mm256_blend_epi32(threes, nines, 0b1000_0010), expected);
+
+        let expected = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
+        assert_eq_m256i(_mm256_blend_epi32(threes, nines, 0b0111_1100), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_blend_epi16() {
+        let threes = _mm256_set1_epi16(3);
+        let nines = _mm256_set1_epi16(9);
+        // With only mask bit 0 set, a 9 is expected in lane 0 and in lane 8,
+        // i.e. the first lane of each group of eight.
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            9, 3, 3, 3, 3, 3, 3, 3,
+            9, 3, 3, 3, 3, 3, 3, 3,
+        );
+        let blended = _mm256_blend_epi16(threes, nines, 0b0000_0001);
+        assert_eq_m256i(blended, expected);
+
+        // Swapped operands with the complementary mask give the same result.
+        let blended = _mm256_blend_epi16(nines, threes, 0b1111_1110);
+        assert_eq_m256i(blended, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_blendv_epi8() {
+        let fours = _mm256_set1_epi8(4);
+        let twos = _mm256_set1_epi8(2);
+        // Only byte 2 of the mask is non-zero (-1), and only byte 2 of the
+        // expected vector takes its value from the second operand.
+        let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
+        let expected = _mm256_insert_epi8(fours, 2, 2);
+        assert_eq_m256i(_mm256_blendv_epi8(fours, twos, mask), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastb_epi8() {
+        // Only byte 0 of the source is non-zero; every output byte is expected
+        // to equal it.
+        let zeros = _mm_set1_epi8(0x00);
+        let source = _mm_insert_epi8(zeros, 0x2a, 0);
+        let expected = _mm_set1_epi8(0x2a);
+        assert_eq_m128i(_mm_broadcastb_epi8(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastb_epi8() {
+        // Only byte 0 of the 128-bit source is non-zero; all 32 output bytes
+        // are expected to equal it.
+        let zeros = _mm_set1_epi8(0x00);
+        let source = _mm_insert_epi8(zeros, 0x2a, 0);
+        let expected = _mm256_set1_epi8(0x2a);
+        assert_eq_m256i(_mm256_broadcastb_epi8(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastd_epi32() {
+        // Lane 1 holds a different value that must not show up in the output.
+        let lowest = 0x2a;
+        let source = _mm_setr_epi32(lowest, 0x8000000, 0, 0);
+        let expected = _mm_set1_epi32(lowest);
+        assert_eq_m128i(_mm_broadcastd_epi32(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastd_epi32() {
+        // Lane 1 holds a different value that must not show up in the output.
+        let lowest = 0x2a;
+        let source = _mm_setr_epi32(lowest, 0x8000000, 0, 0);
+        let expected = _mm256_set1_epi32(lowest);
+        assert_eq_m256i(_mm256_broadcastd_epi32(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastq_epi64() {
+        // The low lane's value does not fit in 32 bits, so the whole 64-bit
+        // lane has to be replicated for the comparison to pass.
+        let lowest = 0x1ffffffff;
+        let source = _mm_setr_epi64x(lowest, 0);
+        let expected = _mm_set1_epi64x(lowest);
+        assert_eq_m128i(_mm_broadcastq_epi64(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastq_epi64() {
+        // The low lane's value does not fit in 32 bits, so the whole 64-bit
+        // lane has to be replicated for the comparison to pass.
+        let lowest = 0x1ffffffff;
+        let source = _mm_setr_epi64x(lowest, 0);
+        let expected = _mm256_set1_epi64x(lowest);
+        assert_eq_m256i(_mm256_broadcastq_epi64(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastsd_pd() {
+        // The first element (6.28) is expected in both output lanes.
+        let lowest = 6.28_f64;
+        let source = _mm_setr_pd(lowest, 3.14);
+        let expected = _mm_set1_pd(lowest);
+        assert_eq_m128d(_mm_broadcastsd_pd(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastsd_pd() {
+        // The first element (6.28) is expected in all four output lanes.
+        let lowest = 6.28_f64;
+        let source = _mm_setr_pd(lowest, 3.14);
+        let expected = _mm256_set1_pd(lowest);
+        assert_eq_m256d(_mm256_broadcastsd_pd(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastsi128_si256() {
+        // The 128-bit source is expected to appear twice, once per half.
+        let first = 0x0987654321012334_i64;
+        let second = 0x5678909876543210_i64;
+        let source = _mm_setr_epi64x(first, second);
+        let widened = _mm256_broadcastsi128_si256(source);
+        let expected = _mm256_setr_epi64x(first, second, first, second);
+        assert_eq_m256i(widened, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastss_ps() {
+        // The first element (6.28) is expected in all four output lanes.
+        let lowest = 6.28_f32;
+        let source = _mm_setr_ps(lowest, 3.14, 0.0, 0.0);
+        let expected = _mm_set1_ps(lowest);
+        assert_eq_m128(_mm_broadcastss_ps(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastss_ps() {
+        // The first element (6.28) is expected in all eight output lanes.
+        let lowest = 6.28_f32;
+        let source = _mm_setr_ps(lowest, 3.14, 0.0, 0.0);
+        let expected = _mm256_set1_ps(lowest);
+        assert_eq_m256(_mm256_broadcastss_ps(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_broadcastw_epi16() {
+        // Lane 0 is overwritten with 0x22b; the remaining lanes keep 0x2a and
+        // must not show up in the output.
+        let filler = _mm_set1_epi16(0x2a);
+        let source = _mm_insert_epi16(filler, 0x22b, 0);
+        let expected = _mm_set1_epi16(0x22b);
+        assert_eq_m128i(_mm_broadcastw_epi16(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_broadcastw_epi16() {
+        // Lane 0 is overwritten with 0x22b; the remaining lanes keep 0x2a and
+        // must not show up in the output.
+        let filler = _mm_set1_epi16(0x2a);
+        let source = _mm_insert_epi16(filler, 0x22b, 0);
+        let expected = _mm256_set1_epi16(0x22b);
+        assert_eq_m256i(_mm256_broadcastw_epi16(source), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpeq_epi8() {
+        // Ascending bytes against descending bytes; the descending side has a 2
+        // planted at index 2, which is the only position where they agree.
+        #[rustfmt::skip]
+        let ascending = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let descending = _mm256_setr_epi8(
+            31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+        );
+        let matches = _mm256_cmpeq_epi8(ascending, descending);
+        // Expected mask: all bits set in byte 2, zero elsewhere.
+        let expected = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
+        assert_eq_m256i(matches, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpeq_epi16() {
+        // Ascending lanes against descending lanes; the descending side has a 2
+        // planted at index 2, which is the only position where they agree.
+        #[rustfmt::skip]
+        let ascending = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let descending = _mm256_setr_epi16(
+            15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+        );
+        let matches = _mm256_cmpeq_epi16(ascending, descending);
+        // Expected mask: all bits set in lane 2, zero elsewhere.
+        let expected = _mm256_insert_epi16(_mm256_set1_epi16(0), -1, 2);
+        assert_eq_m256i(matches, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpeq_epi32() {
+        // The two inputs agree only in lane 2.
+        let lhs = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        let rhs = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
+        let matches = _mm256_cmpeq_epi32(lhs, rhs);
+        // Expected mask: all bits set in lane 2, zero elsewhere.
+        let expected = _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 2);
+        assert_eq_m256i(matches, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpeq_epi64() {
+        // The two inputs agree only in lane 2.
+        let lhs = _mm256_setr_epi64x(0, 1, 2, 3);
+        let rhs = _mm256_setr_epi64x(3, 2, 2, 0);
+        let matches = _mm256_cmpeq_epi64(lhs, rhs);
+        // Expected mask: all bits set in lane 2, zero elsewhere.
+        let expected = _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2);
+        assert_eq_m256i(matches, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpgt_epi8() {
+        // Only byte 0 of the left operand (5) exceeds the all-zero right operand.
+        let zeros = _mm256_set1_epi8(0);
+        let lhs = _mm256_insert_epi8(zeros, 5, 0);
+        let greater = _mm256_cmpgt_epi8(lhs, zeros);
+        let expected = _mm256_insert_epi8(zeros, !0, 0);
+        assert_eq_m256i(greater, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpgt_epi16() {
+        // Only lane 0 of the left operand (5) exceeds the all-zero right operand.
+        let zeros = _mm256_set1_epi16(0);
+        let lhs = _mm256_insert_epi16(zeros, 5, 0);
+        let greater = _mm256_cmpgt_epi16(lhs, zeros);
+        let expected = _mm256_insert_epi16(zeros, !0, 0);
+        assert_eq_m256i(greater, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpgt_epi32() {
+        // Only lane 0 of the left operand (5) exceeds the all-zero right operand.
+        let zeros = _mm256_set1_epi32(0);
+        let lhs = _mm256_insert_epi32(zeros, 5, 0);
+        let greater = _mm256_cmpgt_epi32(lhs, zeros);
+        let expected = _mm256_insert_epi32(zeros, !0, 0);
+        assert_eq_m256i(greater, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cmpgt_epi64() {
+        // Only lane 0 of the left operand (5) exceeds the all-zero right operand.
+        let zeros = _mm256_set1_epi64x(0);
+        let lhs = _mm256_insert_epi64(zeros, 5, 0);
+        let greater = _mm256_cmpgt_epi64(lhs, zeros);
+        let expected = _mm256_insert_epi64(zeros, !0, 0);
+        assert_eq_m256i(greater, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi8_epi16() {
+        // All sixteen bytes, negatives included, are expected to keep their
+        // value after widening to 16 bits.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+        );
+        assert_eq_m256i(expected, _mm256_cvtepi8_epi16(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi8_epi32() {
+        // Only the first eight bytes appear in the expected 32-bit lanes.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+        );
+        let expected = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+        assert_eq_m256i(expected, _mm256_cvtepi8_epi32(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi8_epi64() {
+        // Only the first four bytes appear in the expected 64-bit lanes.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+        );
+        let expected = _mm256_setr_epi64x(0, 0, -1, 1);
+        assert_eq_m256i(expected, _mm256_cvtepi8_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi16_epi32() {
+        // All eight 16-bit lanes, negatives included, keep their value at 32 bits.
+        let narrow = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+        let expected = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+        assert_eq_m256i(expected, _mm256_cvtepi16_epi32(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi16_epi64() {
+        // Only the first four 16-bit lanes appear in the expected 64-bit lanes.
+        let narrow = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+        let expected = _mm256_setr_epi64x(0, 0, -1, 1);
+        assert_eq_m256i(expected, _mm256_cvtepi16_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepi32_epi64() {
+        // All four 32-bit lanes, the negative one included, keep their value
+        // at 64 bits.
+        let narrow = _mm_setr_epi32(0, 0, -1, 1);
+        let expected = _mm256_setr_epi64x(0, 0, -1, 1);
+        assert_eq_m256i(expected, _mm256_cvtepi32_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu16_epi32() {
+        // Eight small non-negative 16-bit lanes keep their value at 32 bits.
+        let narrow = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let expected = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq_m256i(expected, _mm256_cvtepu16_epi32(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu16_epi64() {
+        // Only the first four 16-bit lanes appear in the expected 64-bit lanes.
+        let narrow = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let expected = _mm256_setr_epi64x(0, 1, 2, 3);
+        assert_eq_m256i(expected, _mm256_cvtepu16_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu32_epi64() {
+        // Four small non-negative 32-bit lanes keep their value at 64 bits.
+        let narrow = _mm_setr_epi32(0, 1, 2, 3);
+        let expected = _mm256_setr_epi64x(0, 1, 2, 3);
+        assert_eq_m256i(expected, _mm256_cvtepu32_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu8_epi16() {
+        // Sixteen small non-negative bytes keep their value at 16 bits.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        assert_eq_m256i(expected, _mm256_cvtepu8_epi16(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu8_epi32() {
+        // Only the first eight bytes appear in the expected 32-bit lanes.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        let expected = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq_m256i(expected, _mm256_cvtepu8_epi32(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtepu8_epi64() {
+        // Only the first four bytes appear in the expected 64-bit lanes.
+        #[rustfmt::skip]
+        let narrow = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        let expected = _mm256_setr_epi64x(0, 1, 2, 3);
+        assert_eq_m256i(expected, _mm256_cvtepu8_epi64(narrow));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_extracti128_si256() {
+        // With selector 1 the expected result is the second pair of lanes (3, 4).
+        let wide = _mm256_setr_epi64x(1, 2, 3, 4);
+        let expected = _mm_setr_epi64x(3, 4);
+        assert_eq_m128i(_mm256_extracti128_si256(wide, 1), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hadd_epi16() {
+        let twos = _mm256_set1_epi16(2);
+        let fours = _mm256_set1_epi16(4);
+        // Expected layout per group of eight lanes: four pair-sums of the first
+        // operand (2 + 2), then four pair-sums of the second (4 + 4).
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            4, 4, 4, 4, 8, 8, 8, 8,
+            4, 4, 4, 4, 8, 8, 8, 8,
+        );
+        assert_eq_m256i(_mm256_hadd_epi16(twos, fours), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hadd_epi32() {
+        let twos = _mm256_set1_epi32(2);
+        let fours = _mm256_set1_epi32(4);
+        // Expected layout per group of four lanes: two pair-sums of the first
+        // operand (2 + 2), then two pair-sums of the second (4 + 4).
+        let expected = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
+        assert_eq_m256i(_mm256_hadd_epi32(twos, fours), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hadds_epi16() {
+        // Lanes 0 and 1 of the first operand are 0x7fff and 1, so their sum
+        // would exceed 0x7fff; the expected result keeps 0x7FFF in lane 0.
+        let twos = _mm256_set1_epi16(2);
+        let lhs = _mm256_insert_epi16(_mm256_insert_epi16(twos, 0x7fff, 0), 1, 1);
+        let rhs = _mm256_set1_epi16(4);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
+            4, 4, 4, 4, 8, 8, 8, 8,
+        );
+        assert_eq_m256i(_mm256_hadds_epi16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hsub_epi16() {
+        // Each operand is uniform, so every adjacent-pair difference is zero.
+        let twos = _mm256_set1_epi16(2);
+        let fours = _mm256_set1_epi16(4);
+        let expected = _mm256_set1_epi16(0);
+        assert_eq_m256i(_mm256_hsub_epi16(twos, fours), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hsub_epi32() {
+        // Each operand is uniform, so every adjacent-pair difference is zero.
+        let twos = _mm256_set1_epi32(2);
+        let fours = _mm256_set1_epi32(4);
+        let expected = _mm256_set1_epi32(0);
+        assert_eq_m256i(_mm256_hsub_epi32(twos, fours), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_hsubs_epi16() {
+        // Lanes 0 and 1 of the first operand are 0x7fff and -1, so their
+        // difference would exceed 0x7fff; the expected result keeps 0x7FFF in
+        // lane 0 and zero everywhere else.
+        let twos = _mm256_set1_epi16(2);
+        let lhs = _mm256_insert_epi16(_mm256_insert_epi16(twos, 0x7fff, 0), -1, 1);
+        let rhs = _mm256_set1_epi16(4);
+        let expected = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
+        assert_eq_m256i(_mm256_hsubs_epi16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_madd_epi16() {
+        // Each expected 32-bit lane is 16, i.e. 2 * 4 + 2 * 4.
+        let twos = _mm256_set1_epi16(2);
+        let fours = _mm256_set1_epi16(4);
+        let expected = _mm256_set1_epi32(16);
+        assert_eq_m256i(_mm256_madd_epi16(twos, fours), expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_inserti128_si256() {
+        // With selector 1 the 128-bit value is expected to replace the second
+        // pair of lanes, leaving the first pair untouched.
+        let wide = _mm256_setr_epi64x(1, 2, 3, 4);
+        let replacement = _mm_setr_epi64x(7, 8);
+        let expected = _mm256_setr_epi64x(1, 2, 7, 8);
+        assert_eq_m256i(_mm256_inserti128_si256(wide, replacement, 1), expected);
+    }
+
+    // Multiply-add of all-2 and all-4 byte vectors is expected to give 16 in every 16-bit lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_maddubs_epi16() {
+        let lhs = _mm256_set1_epi8(2);
+        let rhs = _mm256_set1_epi8(4);
+        let expected = _mm256_set1_epi16(16);
+        assert_eq_m256i(_mm256_maddubs_epi16(lhs, rhs), expected);
+    }
+
+    // Only the elements whose mask entry is -1 are expected to be loaded; the rest read as 0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_maskload_epi32() {
+        let data: [i32; 4] = [1, 2, 3, 4];
+        let select = _mm_setr_epi32(-1, 0, 0, -1);
+        let loaded = _mm_maskload_epi32(data.as_ptr(), select);
+        assert_eq_m128i(loaded, _mm_setr_epi32(1, 0, 0, 4));
+    }
+
+    // Only the elements whose mask entry is -1 are expected to be loaded; the rest read as 0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_maskload_epi32() {
+        let data: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+        let select = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+        let loaded = _mm256_maskload_epi32(data.as_ptr(), select);
+        assert_eq_m256i(loaded, _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0));
+    }
+
+    // Only the element whose mask entry is -1 is expected to be loaded; the other reads as 0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_maskload_epi64() {
+        let data: [i64; 2] = [1, 2];
+        let select = _mm_setr_epi64x(0, -1);
+        let loaded = _mm_maskload_epi64(data.as_ptr(), select);
+        assert_eq_m128i(loaded, _mm_setr_epi64x(0, 2));
+    }
+
+    // Only the elements whose mask entry is -1 are expected to be loaded; the rest read as 0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_maskload_epi64() {
+        let data: [i64; 4] = [1, 2, 3, 4];
+        let select = _mm256_setr_epi64x(0, -1, -1, 0);
+        let loaded = _mm256_maskload_epi64(data.as_ptr(), select);
+        assert_eq_m256i(loaded, _mm256_setr_epi64x(0, 2, 3, 0));
+    }
+
+    // Slots whose mask entry is 0 are expected to keep their initial -1 after the store.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_maskstore_epi32() {
+        let mut out = [-1, -1, -1, -1];
+        let select = _mm_setr_epi32(-1, 0, 0, -1);
+        let values = _mm_setr_epi32(1, 2, 3, 4);
+        _mm_maskstore_epi32(out.as_mut_ptr(), select, values);
+        assert_eq!(out, [1, -1, -1, 4]);
+    }
+
+    // Slots whose mask entry is -1 are expected to be overwritten (including the ones that
+    // start out with a value other than -1); masked-off slots must stay untouched.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_maskstore_epi32() {
+        let mut out = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
+        let select = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+        let values = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
+        _mm256_maskstore_epi32(out.as_mut_ptr(), select, values);
+        assert_eq!(out, [1, -1, -1, 42, -1, 6, 7, -1]);
+    }
+
+    // The slot whose mask entry is 0 is expected to keep its initial -1 after the store.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_maskstore_epi64() {
+        let mut out = [-1_i64, -1_i64];
+        let select = _mm_setr_epi64x(0, -1);
+        let values = _mm_setr_epi64x(1_i64, 2_i64);
+        _mm_maskstore_epi64(out.as_mut_ptr(), select, values);
+        assert_eq!(out, [-1, 2]);
+    }
+
+    // Slots whose mask entry is 0 are expected to keep their initial -1 after the store.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_maskstore_epi64() {
+        let mut out = [-1_i64, -1_i64, -1_i64, -1_i64];
+        let select = _mm256_setr_epi64x(0, -1, -1, 0);
+        let values = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
+        _mm256_maskstore_epi64(out.as_mut_ptr(), select, values);
+        assert_eq!(out, [-1, 2, 3, -1]);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epi16() {
+        let small = _mm256_set1_epi16(2);
+        let large = _mm256_set1_epi16(4);
+        assert_eq_m256i(_mm256_max_epi16(small, large), large);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epi32() {
+        let small = _mm256_set1_epi32(2);
+        let large = _mm256_set1_epi32(4);
+        assert_eq_m256i(_mm256_max_epi32(small, large), large);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epi8() {
+        let small = _mm256_set1_epi8(2);
+        let large = _mm256_set1_epi8(4);
+        assert_eq_m256i(_mm256_max_epi8(small, large), large);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epu16() {
+        let small = _mm256_set1_epi16(2);
+        let large = _mm256_set1_epi16(4);
+        assert_eq_m256i(_mm256_max_epu16(small, large), large);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epu32() {
+        let small = _mm256_set1_epi32(2);
+        let large = _mm256_set1_epi32(4);
+        assert_eq_m256i(_mm256_max_epu32(small, large), large);
+    }
+
+    // The lane-wise maximum of all-2 and all-4 vectors should equal the all-4 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_max_epu8() {
+        let small = _mm256_set1_epi8(2);
+        let large = _mm256_set1_epi8(4);
+        assert_eq_m256i(_mm256_max_epu8(small, large), large);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epi16() {
+        let small = _mm256_set1_epi16(2);
+        let large = _mm256_set1_epi16(4);
+        assert_eq_m256i(_mm256_min_epi16(small, large), small);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epi32() {
+        let small = _mm256_set1_epi32(2);
+        let large = _mm256_set1_epi32(4);
+        assert_eq_m256i(_mm256_min_epi32(small, large), small);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epi8() {
+        let small = _mm256_set1_epi8(2);
+        let large = _mm256_set1_epi8(4);
+        assert_eq_m256i(_mm256_min_epi8(small, large), small);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epu16() {
+        let small = _mm256_set1_epi16(2);
+        let large = _mm256_set1_epi16(4);
+        assert_eq_m256i(_mm256_min_epu16(small, large), small);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epu32() {
+        let small = _mm256_set1_epi32(2);
+        let large = _mm256_set1_epi32(4);
+        assert_eq_m256i(_mm256_min_epu32(small, large), small);
+    }
+
+    // The lane-wise minimum of all-2 and all-4 vectors should equal the all-2 vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_min_epu8() {
+        let small = _mm256_set1_epi8(2);
+        let large = _mm256_set1_epi8(4);
+        assert_eq_m256i(_mm256_min_epu8(small, large), small);
+    }
+
+    // A vector whose bytes are all -1 is expected to produce a mask of -1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_movemask_epi8() {
+        let all_set = _mm256_set1_epi8(-1);
+        let mask = _mm256_movemask_epi8(all_set);
+        assert_eq!(mask, -1);
+    }
+
+    // With all-2 and all-4 byte inputs and control 0, every 16-bit result is expected to be 8.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mpsadbw_epu8() {
+        let lhs = _mm256_set1_epi8(2);
+        let rhs = _mm256_set1_epi8(4);
+        let expected = _mm256_set1_epi16(8);
+        assert_eq_m256i(_mm256_mpsadbw_epu8(lhs, rhs, 0), expected);
+    }
+
+    // The expected 64-bit products (0, 0, 10, 14) come from the even-indexed 32-bit elements.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mul_epi32() {
+        let lhs = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+        let rhs = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let expected = _mm256_setr_epi64x(0, 0, 10, 14);
+        assert_eq_m256i(_mm256_mul_epi32(lhs, rhs), expected);
+    }
+
+    // The expected 64-bit products (0, 0, 10, 14) come from the even-indexed 32-bit elements.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mul_epu32() {
+        let lhs = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+        let rhs = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let expected = _mm256_setr_epi64x(0, 0, 10, 14);
+        assert_eq_m256i(_mm256_mul_epu32(lhs, rhs), expected);
+    }
+
+    // 6535 * 6535 = 42_706_225; its upper 16 bits are 651.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mulhi_epi16() {
+        let factor = _mm256_set1_epi16(6535);
+        let other = _mm256_set1_epi16(6535);
+        let expected = _mm256_set1_epi16(651);
+        assert_eq_m256i(_mm256_mulhi_epi16(factor, other), expected);
+    }
+
+    // 6535 * 6535 = 42_706_225; its upper 16 bits are 651.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mulhi_epu16() {
+        let factor = _mm256_set1_epi16(6535);
+        let other = _mm256_set1_epi16(6535);
+        let expected = _mm256_set1_epi16(651);
+        assert_eq_m256i(_mm256_mulhi_epu16(factor, other), expected);
+    }
+
+    // 2 * 4 fits in 16 bits, so every lane of the low product is expected to be 8.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mullo_epi16() {
+        let lhs = _mm256_set1_epi16(2);
+        let rhs = _mm256_set1_epi16(4);
+        let expected = _mm256_set1_epi16(8);
+        assert_eq_m256i(_mm256_mullo_epi16(lhs, rhs), expected);
+    }
+
+    // 2 * 4 fits in 32 bits, so every lane of the low product is expected to be 8.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mullo_epi32() {
+        let lhs = _mm256_set1_epi32(2);
+        let rhs = _mm256_set1_epi32(4);
+        let expected = _mm256_set1_epi32(8);
+        assert_eq_m256i(_mm256_mullo_epi32(lhs, rhs), expected);
+    }
+
+    // Exercises `_mm256_mulhrs_epi16` itself (the previous body called `_mm256_mullo_epi16`,
+    // so the intrinsic named by this test was never run).
+    //
+    // Per Intel's description each lane is `(((a * b) >> 14) + 1) >> 1`. With a = 0x4000 and
+    // b = 0x2000: a * b = 0x0800_0000, >> 14 = 0x2000, + 1 = 0x2001, >> 1 = 0x1000.
+    // These inputs also distinguish the result from `mullo` (0) and `mulhi` (0x0800).
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mulhrs_epi16() {
+        let a = _mm256_set1_epi16(0x4000);
+        let b = _mm256_set1_epi16(0x2000);
+        let r = _mm256_mulhrs_epi16(a, b);
+        let e = _mm256_set1_epi16(0x1000);
+        assert_eq_m256i(r, e);
+    }
+
+    // OR-ing an all-ones vector with an all-zeros vector should give back the all-ones vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_or_si256() {
+        let ones = _mm256_set1_epi8(-1);
+        let zeros = _mm256_set1_epi8(0);
+        assert_eq_m256i(_mm256_or_si256(ones, zeros), ones);
+    }
+
+    // The expected result interleaves the two inputs in groups of eight bytes.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi16() {
+        let twos = _mm256_set1_epi16(2);
+        let fours = _mm256_set1_epi16(4);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+        let packed = _mm256_packs_epi16(twos, fours);
+
+        assert_eq_m256i(packed, expected);
+    }
+
+    // The expected result interleaves the two inputs in groups of four 16-bit elements.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi32() {
+        let twos = _mm256_set1_epi32(2);
+        let fours = _mm256_set1_epi32(4);
+        let expected = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+        let packed = _mm256_packs_epi32(twos, fours);
+
+        assert_eq_m256i(packed, expected);
+    }
+
+    // The expected result interleaves the two inputs in groups of eight bytes.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi16() {
+        let twos = _mm256_set1_epi16(2);
+        let fours = _mm256_set1_epi16(4);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+        let packed = _mm256_packus_epi16(twos, fours);
+
+        assert_eq_m256i(packed, expected);
+    }
+
+    // The expected result interleaves the two inputs in groups of four 16-bit elements.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi32() {
+        let twos = _mm256_set1_epi32(2);
+        let fours = _mm256_set1_epi32(4);
+        let expected = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+        let packed = _mm256_packus_epi32(twos, fours);
+
+        assert_eq_m256i(packed, expected);
+    }
+
+    // Bytes differ by 2 everywhere, so each 64-bit sum of eight differences should be 16.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sad_epu8() {
+        let lhs = _mm256_set1_epi8(2);
+        let rhs = _mm256_set1_epi8(4);
+        let expected = _mm256_set1_epi64x(16);
+        assert_eq_m256i(_mm256_sad_epu8(lhs, rhs), expected);
+    }
+
+    // Only the upper four 16-bit elements of each 128-bit half are expected to be permuted;
+    // the lower four must pass through unchanged.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_shufflehi_epi16() {
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi16(
+            0, 1, 2, 3, 11, 22, 33, 44,
+            4, 5, 6, 7, 55, 66, 77, 88,
+        );
+        let shuffled = _mm256_shufflehi_epi16(input, 0b00_01_01_11);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            0, 1, 2, 3, 44, 22, 22, 11,
+            4, 5, 6, 7, 88, 66, 66, 55,
+        );
+        assert_eq_m256i(shuffled, expected);
+    }
+
+    // Only the lower four 16-bit elements of each 128-bit half are expected to be permuted;
+    // the upper four must pass through unchanged.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_shufflelo_epi16() {
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi16(
+            11, 22, 33, 44, 0, 1, 2, 3,
+            55, 66, 77, 88, 4, 5, 6, 7,
+        );
+        let shuffled = _mm256_shufflelo_epi16(input, 0b00_01_01_11);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi16(
+            44, 22, 22, 11, 0, 1, 2, 3,
+            88, 66, 66, 55, 4, 5, 6, 7,
+        );
+        assert_eq_m256i(shuffled, expected);
+    }
+
+    // Applying the sign of an all -1 vector to an all-2 vector is expected to give all -2.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi16() {
+        let magnitude = _mm256_set1_epi16(2);
+        let sign = _mm256_set1_epi16(-1);
+        let expected = _mm256_set1_epi16(-2);
+        assert_eq_m256i(_mm256_sign_epi16(magnitude, sign), expected);
+    }
+
+    // Applying the sign of an all -1 vector to an all-2 vector is expected to give all -2.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi32() {
+        let magnitude = _mm256_set1_epi32(2);
+        let sign = _mm256_set1_epi32(-1);
+        let expected = _mm256_set1_epi32(-2);
+        assert_eq_m256i(_mm256_sign_epi32(magnitude, sign), expected);
+    }
+
+    // Applying the sign of an all -1 vector to an all-2 vector is expected to give all -2.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi8() {
+        let magnitude = _mm256_set1_epi8(2);
+        let sign = _mm256_set1_epi8(-1);
+        let expected = _mm256_set1_epi8(-2);
+        assert_eq_m256i(_mm256_sign_epi8(magnitude, sign), expected);
+    }
+
+    // Shifting 0xFF left by the count held in element 0 (4) is expected to give 0xFF0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi16() {
+        let count = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
+        let value = _mm256_set1_epi16(0xFF);
+        let shifted = _mm256_sll_epi16(value, count);
+        let expected = _mm256_set1_epi16(0xFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFF left by the count held in element 0 (4) is expected to give 0xFFFF0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi32() {
+        let count = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
+        let value = _mm256_set1_epi32(0xFFFF);
+        let shifted = _mm256_sll_epi32(value, count);
+        let expected = _mm256_set1_epi32(0xFFFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFFFFFF left by the count held in element 0 (4) should give 0xFFFFFFFF0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi64() {
+        let count = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
+        let value = _mm256_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm256_sll_epi64(value, count);
+        let expected = _mm256_set1_epi64x(0xFFFFFFFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFF left by an immediate 4 is expected to give 0xFF0 in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_slli_epi16() {
+        let value = _mm256_set1_epi16(0xFF);
+        let shifted = _mm256_slli_epi16(value, 4);
+        let expected = _mm256_set1_epi16(0xFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFF left by an immediate 4 is expected to give 0xFFFF0 in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_slli_epi32() {
+        let value = _mm256_set1_epi32(0xFFFF);
+        let shifted = _mm256_slli_epi32(value, 4);
+        let expected = _mm256_set1_epi32(0xFFFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFFFFFF left by an immediate 4 is expected to give 0xFFFFFFFF0 in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_slli_epi64() {
+        let value = _mm256_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm256_slli_epi64(value, 4);
+        let expected = _mm256_set1_epi64x(0xFFFFFFFF0);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // A 3-byte left shift of 0x0000_0000_FFFF_FFFF elements should give 0x00FF_FFFF_FF00_0000.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_slli_si256() {
+        let value = _mm256_set1_epi64x(0xFFFFFFFF);
+        let expected = _mm256_set1_epi64x(0xFFFFFFFF000000);
+        let shifted = _mm256_slli_si256(value, 3);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting every 2 left by a per-lane count of 1 is expected to give 4.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_sllv_epi32() {
+        let value = _mm_set1_epi32(2);
+        let count = _mm_set1_epi32(1);
+        let expected = _mm_set1_epi32(4);
+        assert_eq_m128i(_mm_sllv_epi32(value, count), expected);
+    }
+
+    // Shifting every 2 left by a per-lane count of 1 is expected to give 4.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sllv_epi32() {
+        let value = _mm256_set1_epi32(2);
+        let count = _mm256_set1_epi32(1);
+        let expected = _mm256_set1_epi32(4);
+        assert_eq_m256i(_mm256_sllv_epi32(value, count), expected);
+    }
+
+    // Shifting every 2 left by a per-lane count of 1 is expected to give 4.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_sllv_epi64() {
+        let value = _mm_set1_epi64x(2);
+        let count = _mm_set1_epi64x(1);
+        let expected = _mm_set1_epi64x(4);
+        assert_eq_m128i(_mm_sllv_epi64(value, count), expected);
+    }
+
+    // Shifting every 2 left by a per-lane count of 1 is expected to give 4.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sllv_epi64() {
+        let value = _mm256_set1_epi64x(2);
+        let count = _mm256_set1_epi64x(1);
+        let expected = _mm256_set1_epi64x(4);
+        assert_eq_m256i(_mm256_sllv_epi64(value, count), expected);
+    }
+
+    // An arithmetic right shift of -1 by one bit is expected to stay -1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sra_epi16() {
+        let count = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+        let value = _mm256_set1_epi16(-1);
+        let shifted = _mm256_sra_epi16(value, count);
+        let expected = _mm256_set1_epi16(-1);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // An arithmetic right shift of -1 by one bit is expected to stay -1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sra_epi32() {
+        let count = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
+        let value = _mm256_set1_epi32(-1);
+        let shifted = _mm256_sra_epi32(value, count);
+        let expected = _mm256_set1_epi32(-1);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // An arithmetic right shift of -1 by an immediate 1 is expected to stay -1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srai_epi16() {
+        let value = _mm256_set1_epi16(-1);
+        let shifted = _mm256_srai_epi16(value, 1);
+        let expected = _mm256_set1_epi16(-1);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // An arithmetic right shift of -1 by an immediate 1 is expected to stay -1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srai_epi32() {
+        let value = _mm256_set1_epi32(-1);
+        let shifted = _mm256_srai_epi32(value, 1);
+        let expected = _mm256_set1_epi32(-1);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting every 4 right by a per-lane count of 1 is expected to give 2.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_srav_epi32() {
+        let value = _mm_set1_epi32(4);
+        let shift = _mm_set1_epi32(1);
+        let expected = _mm_set1_epi32(2);
+        assert_eq_m128i(_mm_srav_epi32(value, shift), expected);
+    }
+
+    // Shifting every 4 right by a per-lane count of 1 is expected to give 2.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srav_epi32() {
+        let value = _mm256_set1_epi32(4);
+        let shift = _mm256_set1_epi32(1);
+        let expected = _mm256_set1_epi32(2);
+        assert_eq_m256i(_mm256_srav_epi32(value, shift), expected);
+    }
+
+    // Each 16-byte half is expected to shift right by three bytes independently, with
+    // zeros filling the top three bytes of each half.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srli_si256() {
+        #[rustfmt::skip]
+        let input = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            4, 5, 6, 7, 8, 9, 10, 11,
+            12, 13, 14, 15, 16, 0, 0, 0,
+            20, 21, 22, 23, 24, 25, 26, 27,
+            28, 29, 30, 31, 32, 0, 0, 0,
+        );
+        let shifted = _mm256_srli_si256(input, 3);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFF right by the count held in element 0 (4) is expected to give 0xF.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi16() {
+        let count = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
+        let value = _mm256_set1_epi16(0xFF);
+        let shifted = _mm256_srl_epi16(value, count);
+        let expected = _mm256_set1_epi16(0xF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFF right by the count held in element 0 (4) is expected to give 0xFFF.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi32() {
+        let count = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
+        let value = _mm256_set1_epi32(0xFFFF);
+        let shifted = _mm256_srl_epi32(value, count);
+        let expected = _mm256_set1_epi32(0xFFF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFFFFFF right by the count held in element 0 (4) should give 0xFFFFFFF.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi64() {
+        let count = _mm_setr_epi64x(4, 0);
+        let value = _mm256_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm256_srl_epi64(value, count);
+        let expected = _mm256_set1_epi64x(0xFFFFFFF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFF right by an immediate 4 is expected to give 0xF in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srli_epi16() {
+        let value = _mm256_set1_epi16(0xFF);
+        let shifted = _mm256_srli_epi16(value, 4);
+        let expected = _mm256_set1_epi16(0xF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFF right by an immediate 4 is expected to give 0xFFF in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srli_epi32() {
+        let value = _mm256_set1_epi32(0xFFFF);
+        let shifted = _mm256_srli_epi32(value, 4);
+        let expected = _mm256_set1_epi32(0xFFF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting 0xFFFFFFFF right by an immediate 4 is expected to give 0xFFFFFFF in every lane.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srli_epi64() {
+        let value = _mm256_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm256_srli_epi64(value, 4);
+        let expected = _mm256_set1_epi64x(0xFFFFFFF);
+        assert_eq_m256i(shifted, expected);
+    }
+
+    // Shifting every 2 right by a per-lane count of 1 is expected to give 1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_srlv_epi32() {
+        let value = _mm_set1_epi32(2);
+        let shift = _mm_set1_epi32(1);
+        let expected = _mm_set1_epi32(1);
+        assert_eq_m128i(_mm_srlv_epi32(value, shift), expected);
+    }
+
+    // Shifting every 2 right by a per-lane count of 1 is expected to give 1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi32() {
+        let value = _mm256_set1_epi32(2);
+        let shift = _mm256_set1_epi32(1);
+        let expected = _mm256_set1_epi32(1);
+        assert_eq_m256i(_mm256_srlv_epi32(value, shift), expected);
+    }
+
+    // Shifting every 2 right by a per-lane count of 1 is expected to give 1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_srlv_epi64() {
+        let value = _mm_set1_epi64x(2);
+        let shift = _mm_set1_epi64x(1);
+        let expected = _mm_set1_epi64x(1);
+        assert_eq_m128i(_mm_srlv_epi64(value, shift), expected);
+    }
+
+    // Shifting every 2 right by a per-lane count of 1 is expected to give 1.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi64() {
+        let value = _mm256_set1_epi64x(2);
+        let shift = _mm256_set1_epi64x(1);
+        let expected = _mm256_set1_epi64x(1);
+        assert_eq_m256i(_mm256_srlv_epi64(value, shift), expected);
+    }
+
+    // 4 - 2 == 2, so the difference is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sub_epi16() {
+        let minuend = _mm256_set1_epi16(4);
+        let subtrahend = _mm256_set1_epi16(2);
+        assert_eq_m256i(_mm256_sub_epi16(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2, so the difference is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sub_epi32() {
+        let minuend = _mm256_set1_epi32(4);
+        let subtrahend = _mm256_set1_epi32(2);
+        assert_eq_m256i(_mm256_sub_epi32(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2, so the difference is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sub_epi64() {
+        let minuend = _mm256_set1_epi64x(4);
+        let subtrahend = _mm256_set1_epi64x(2);
+        assert_eq_m256i(_mm256_sub_epi64(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2, so the difference is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_sub_epi8() {
+        let minuend = _mm256_set1_epi8(4);
+        let subtrahend = _mm256_set1_epi8(2);
+        assert_eq_m256i(_mm256_sub_epi8(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2 without saturation, so the result is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_subs_epi16() {
+        let minuend = _mm256_set1_epi16(4);
+        let subtrahend = _mm256_set1_epi16(2);
+        assert_eq_m256i(_mm256_subs_epi16(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2 without saturation, so the result is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_subs_epi8() {
+        let minuend = _mm256_set1_epi8(4);
+        let subtrahend = _mm256_set1_epi8(2);
+        assert_eq_m256i(_mm256_subs_epi8(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2 without saturation, so the result is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_subs_epu16() {
+        let minuend = _mm256_set1_epi16(4);
+        let subtrahend = _mm256_set1_epi16(2);
+        assert_eq_m256i(_mm256_subs_epu16(minuend, subtrahend), subtrahend);
+    }
+
+    // 4 - 2 == 2 without saturation, so the result is expected to equal the subtrahend vector.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_subs_epu8() {
+        let minuend = _mm256_set1_epi8(4);
+        let subtrahend = _mm256_set1_epi8(2);
+        assert_eq_m256i(_mm256_subs_epu8(minuend, subtrahend), subtrahend);
+    }
+
+    // 5 ^ 3 == 6 in every byte.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_xor_si256() {
+        let fives = _mm256_set1_epi8(5);
+        let threes = _mm256_set1_epi8(3);
+        let expected = _mm256_set1_epi8(6);
+        assert_eq_m256i(_mm256_xor_si256(fives, threes), expected);
+    }
+
+    // Checks `_mm256_alignr_epi8` for several byte-shift amounts (33, 17, 4, 16, 15 and 0)
+    // using `a` = 1..=32 and `b` = -1..=-32 so the origin of every output byte is visible.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_alignr_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let b = _mm256_setr_epi8(
+            -1, -2, -3, -4, -5, -6, -7, -8,
+            -9, -10, -11, -12, -13, -14, -15, -16,
+            -17, -18, -19, -20, -21, -22, -23, -24,
+            -25, -26, -27, -28, -29, -30, -31, -32,
+        );
+        // A shift of 33 bytes is expected to produce an all-zero vector.
+        let r = _mm256_alignr_epi8(a, b, 33);
+        assert_eq_m256i(r, _mm256_set1_epi8(0));
+
+        // A shift of 17: each 16-byte half of `a` moved down by one byte, zero-filled on top.
+        let r = _mm256_alignr_epi8(a, b, 17);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 0,
+            18, 19, 20, 21, 22, 23, 24, 25,
+            26, 27, 28, 29, 30, 31, 32, 0,
+        );
+        assert_eq_m256i(r, expected);
+
+        // A shift of 4: the top 12 bytes of each half of `b` followed by the low 4 bytes of
+        // the matching half of `a`.
+        let r = _mm256_alignr_epi8(a, b, 4);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            -5, -6, -7, -8, -9, -10, -11, -12,
+            -13, -14, -15, -16, 1, 2, 3, 4,
+            -21, -22, -23, -24, -25, -26, -27, -28,
+            -29, -30, -31, -32, 17, 18, 19, 20,
+        );
+        assert_eq_m256i(r, expected);
+
+        // A shift of 16: this test expects the result to equal `b`.
+        // NOTE(review): Intel's description of this intrinsic (concatenate `a` above `b` per
+        // 128-bit lane, shift right by the byte count, keep the low 16 bytes) suggests a
+        // 16-byte shift should yield `a` rather than `b` - confirm against the implementation
+        // and the hardware behaviour before relying on this expectation.
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            -1, -2, -3, -4, -5, -6, -7, -8,
+            -9, -10, -11, -12, -13, -14, -15, -16, -17,
+            -18, -19, -20, -21, -22, -23, -24, -25,
+            -26, -27, -28, -29, -30, -31, -32,
+        );
+        let r = _mm256_alignr_epi8(a, b, 16);
+        assert_eq_m256i(r, expected);
+
+        // A shift of 15: the top byte of each half of `b` followed by the low 15 bytes of the
+        // matching half of `a`.
+        let r = _mm256_alignr_epi8(a, b, 15);
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            -16, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+            -32, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        assert_eq_m256i(r, expected);
+
+        // A shift of 0 is expected to return `b` unchanged.
+        let r = _mm256_alignr_epi8(a, b, 0);
+        assert_eq_m256i(r, b);
+    }
+
+    // The same 16 selector bytes are used for both halves; the expected output draws from
+    // bytes 1..=16 in the first half and 17..=32 in the second, with a 0 where the selector
+    // byte is 128.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_shuffle_epi8() {
+        #[rustfmt::skip]
+        let bytes = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let selectors = _mm256_setr_epi8(
+            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+            12, 5, 5, 10, 4, 1, 8, 0,
+            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+            12, 5, 5, 10, 4, 1, 8, 0,
+        );
+        let shuffled = _mm256_shuffle_epi8(bytes, selectors);
+        #[rustfmt::skip]
+        let want = _mm256_setr_epi8(
+            5, 0, 5, 4, 9, 13, 7, 4,
+            13, 6, 6, 11, 5, 2, 9, 1,
+            21, 0, 21, 20, 25, 29, 23, 20,
+            29, 22, 22, 27, 21, 18, 25, 17,
+        );
+        assert_eq_m256i(shuffled, want);
+    }
+
+    // Each index picks the matching 32-bit element out of the whole 256-bit source.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_epi32() {
+        let source = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+        let indices = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let permuted = _mm256_permutevar8x32_epi32(source, indices);
+        let want = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+        assert_eq_m256i(permuted, want);
+    }
+
+    // Control 0b00_01_00_11 selects source elements 3, 0, 1, 0 (lowest bit pair first).
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_permute4x64_epi64() {
+        let source = _mm256_setr_epi64x(100, 200, 300, 400);
+        let permuted = _mm256_permute4x64_epi64(source, 0b00010011);
+        let want = _mm256_setr_epi64x(400, 100, 200, 100);
+        assert_eq_m256i(permuted, want);
+    }
+
+    // With control 0b00_01_00_11 the test expects the upper half of the second operand in the
+    // low half of the result and the upper half of the first operand in the high half.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_permute2x128_si256() {
+        let first = _mm256_setr_epi64x(100, 200, 500, 600);
+        let second = _mm256_setr_epi64x(300, 400, 700, 800);
+        let want = _mm256_setr_epi64x(700, 800, 500, 600);
+        let permuted = _mm256_permute2x128_si256(first, second, 0b00_01_00_11);
+        assert_eq_m256i(permuted, want);
+    }
+
+    // Control 0b00_01_00_11 selects source elements 3, 0, 1, 0 (lowest bit pair first).
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_permute4x64_pd() {
+        let source = _mm256_setr_pd(1., 2., 3., 4.);
+        let want = _mm256_setr_pd(4., 1., 2., 1.);
+        let permuted = _mm256_permute4x64_pd(source, 0b00_01_00_11);
+        assert_eq_m256d(permuted, want);
+    }
+
+    // Each index picks the matching f32 element out of the whole 256-bit source.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_ps() {
+        let source = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let indices = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let want = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
+        let permuted = _mm256_permutevar8x32_ps(source, indices);
+        assert_eq_m256(permuted, want);
+    }
+
+    // Gathers from a table in which every element equals its own index, so the gathered
+    // values must equal the offsets that were requested.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i32gather_epi32() {
+        let mut table = [0i32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 32, 48);
+        // A multiplier of 4 is word-addressing
+        let gathered = _mm_i32gather_epi32(table.as_ptr(), offsets, 4);
+        assert_eq_m128i(gathered, _mm_setr_epi32(0, 16, 32, 48));
+    }
+
+    // Lanes whose mask is -1 are gathered from the index table; the lane whose mask is 0 is
+    // expected to keep the fallback value 256.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_epi32() {
+        let mut table = [0i32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let fallback = _mm_set1_epi32(256);
+        let offsets = _mm_setr_epi32(0, 16, 64, 96);
+        let select = _mm_setr_epi32(-1, -1, -1, 0);
+        // A multiplier of 4 is word-addressing
+        let gathered = _mm_mask_i32gather_epi32(fallback, table.as_ptr(), offsets, select, 4);
+        assert_eq_m128i(gathered, _mm_setr_epi32(0, 16, 64, 256));
+    }
+
+    // Gathers from a table in which every element equals its own index, so the gathered
+    // values must equal the offsets that were requested.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_epi32() {
+        let mut table = [0i32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let offsets = _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4);
+        // A multiplier of 4 is word-addressing
+        let gathered = _mm256_i32gather_epi32(table.as_ptr(), offsets, 4);
+        assert_eq_m256i(gathered, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+    }
+
+    // Lanes whose mask is -1 are gathered from the index table; lanes whose mask is 0 are
+    // expected to keep the fallback value 256.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_epi32() {
+        let mut table = [0i32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let fallback = _mm256_set1_epi32(256);
+        let offsets = _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0);
+        let select = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
+        // A multiplier of 4 is word-addressing
+        let gathered = _mm256_mask_i32gather_epi32(fallback, table.as_ptr(), offsets, select, 4);
+        let want = _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256);
+        assert_eq_m256i(gathered, want);
+    }
+
+    // Gathers from a table in which every element equals its own index (as f32), so the
+    // gathered values must equal the offsets that were requested.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i32gather_ps() {
+        let mut table = [0.0f32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 32, 48);
+        // A multiplier of 4 is word-addressing for f32s
+        let gathered = _mm_i32gather_ps(table.as_ptr(), offsets, 4);
+        assert_eq_m128(gathered, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+    }
+
+    // Lanes whose mask is -1.0 are gathered from the index table; the lane whose mask is 0.0
+    // is expected to keep the fallback value 256.0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_ps() {
+        let mut table = [0.0f32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let fallback = _mm_set1_ps(256.0);
+        let offsets = _mm_setr_epi32(0, 16, 64, 96);
+        let select = _mm_setr_ps(-1.0, -1.0, -1.0, 0.0);
+        // A multiplier of 4 is word-addressing for f32s
+        let gathered = _mm_mask_i32gather_ps(fallback, table.as_ptr(), offsets, select, 4);
+        assert_eq_m128(gathered, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+    }
+
+    // Gathers from a table in which every element equals its own index (as f32), so the
+    // gathered values must equal the offsets that were requested.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_ps() {
+        let mut table = [0.0f32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let offsets = _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4);
+        // A multiplier of 4 is word-addressing for f32s
+        let gathered = _mm256_i32gather_ps(table.as_ptr(), offsets, 4);
+        let want = _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0);
+        assert_eq_m256(gathered, want);
+    }
+
+    // Lanes whose mask is -1.0 are gathered from the index table; lanes whose mask is 0.0
+    // are expected to keep the fallback value 256.0.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_ps() {
+        let mut table = [0.0f32; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let fallback = _mm256_set1_ps(256.0);
+        let offsets = _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0);
+        let select = _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+        // A multiplier of 4 is word-addressing for f32s
+        let gathered = _mm256_mask_i32gather_ps(fallback, table.as_ptr(), offsets, select, 4);
+        let want = _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0);
+        assert_eq_m256(gathered, want);
+    }
+
+    // Gathers two i64 values from a table in which every element equals its own index; the
+    // expected result corresponds to the first two offsets (0 and 16).
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i32gather_epi64() {
+        let mut table = [0i64; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 0, 0);
+        // A multiplier of 8 is word-addressing for i64s
+        let gathered = _mm_i32gather_epi64(table.as_ptr(), offsets, 8);
+        assert_eq_m128i(gathered, _mm_setr_epi64x(0, 16));
+    }
+
+    // The lane whose mask is -1 is gathered from the index table; the lane whose mask is 0
+    // is expected to keep the fallback value 256.
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_epi64() {
+        let mut table = [0i64; 128];
+        for (idx, slot) in table.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let fallback = _mm_set1_epi64x(256);
+        let offsets = _mm_setr_epi32(16, 16, 16, 16);
+        let select = _mm_setr_epi64x(-1, 0);
+        // A multiplier of 8 is word-addressing for i64s
+        let gathered = _mm_mask_i32gather_epi64(fallback, table.as_ptr(), offsets, select, 8);
+        assert_eq_m128i(gathered, _mm_setr_epi64x(16, 256));
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 32, 48);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm256_i32gather_epi64(arr.as_ptr(), offsets, 8);
+        let expected = _mm256_setr_epi64x(0, 16, 32, 48);
+        assert_eq_m256i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let src = _mm256_set1_epi64x(256);
+        let offsets = _mm_setr_epi32(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256 fill
+        // value taken from `src`.
+        let mask = _mm256_setr_epi64x(-1, -1, -1, 0);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i32gather_epi64(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm256_setr_epi64x(0, 16, 64, 256);
+        assert_eq_m256i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i32gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 0, 0);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm_i32gather_pd(arr.as_ptr(), offsets, 8);
+        let expected = _mm_setr_pd(0.0, 16.0);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let src = _mm_set1_pd(256.0);
+        let offsets = _mm_setr_epi32(16, 16, 16, 16);
+        // Only the first lane is enabled; the second lane is expected to keep
+        // the 256.0 fill value taken from `src`.
+        let mask = _mm_setr_pd(-1.0, 0.0);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm_mask_i32gather_pd(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm_setr_pd(16.0, 256.0);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let offsets = _mm_setr_epi32(0, 16, 32, 48);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm256_i32gather_pd(arr.as_ptr(), offsets, 8);
+        let expected = _mm256_setr_pd(0.0, 16.0, 32.0, 48.0);
+        assert_eq_m256d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let src = _mm256_set1_pd(256.0);
+        let offsets = _mm_setr_epi32(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256.0 fill
+        // value taken from `src`.
+        let mask = _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i32gather_pd(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm256_setr_pd(0.0, 16.0, 64.0, 256.0);
+        assert_eq_m256d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i64gather_epi32() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let offsets = _mm_setr_epi64x(0, 16);
+        // A scale of 4 (the byte size of an `i32`) turns the offsets into
+        // element indices.
+        let r = _mm_i64gather_epi32(arr.as_ptr(), offsets, 4);
+        // Two 64-bit offsets yield two results; the upper two lanes of the
+        // output are expected to be zero.
+        let expected = _mm_setr_epi32(0, 16, 0, 0);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_epi32() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let src = _mm_set1_epi32(256);
+        let offsets = _mm_setr_epi64x(0, 16);
+        let mask = _mm_setr_epi32(-1, 0, -1, 0);
+        // A scale of 4 (the byte size of an `i32`) turns the offsets into
+        // element indices.
+        let r = _mm_mask_i64gather_epi32(src, arr.as_ptr(), offsets, mask, 4);
+        // Lane 0 is gathered, lane 1 is masked off and keeps the `src` value;
+        // the upper two lanes are expected to be zero.
+        let expected = _mm_setr_epi32(0, 256, 0, 0);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_epi32() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let offsets = _mm256_setr_epi64x(0, 16, 32, 48);
+        // A scale of 4 (the byte size of an `i32`) turns the offsets into
+        // element indices.
+        let r = _mm256_i64gather_epi32(arr.as_ptr(), offsets, 4);
+        // Four 64-bit offsets produce four 32-bit results, i.e. a 128-bit
+        // vector.
+        let expected = _mm_setr_epi32(0, 16, 32, 48);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_epi32() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i32;
+        }
+        let src = _mm_set1_epi32(256);
+        let offsets = _mm256_setr_epi64x(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256 fill
+        // value taken from `src`.
+        let mask = _mm_setr_epi32(-1, -1, -1, 0);
+        // A scale of 4 (the byte size of an `i32`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i64gather_epi32(src, arr.as_ptr(), offsets, mask, 4);
+        let expected = _mm_setr_epi32(0, 16, 64, 256);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i64gather_ps() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let offsets = _mm_setr_epi64x(0, 16);
+        // A scale of 4 (the byte size of an `f32`) turns the offsets into
+        // element indices.
+        let r = _mm_i64gather_ps(arr.as_ptr(), offsets, 4);
+        // Two 64-bit offsets yield two results; the upper two lanes of the
+        // output are expected to be zero.
+        let expected = _mm_setr_ps(0.0, 16.0, 0.0, 0.0);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_ps() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let src = _mm_set1_ps(256.0);
+        let offsets = _mm_setr_epi64x(0, 16);
+        let mask = _mm_setr_ps(-1.0, 0.0, -1.0, 0.0);
+        // A scale of 4 (the byte size of an `f32`) turns the offsets into
+        // element indices.
+        let r = _mm_mask_i64gather_ps(src, arr.as_ptr(), offsets, mask, 4);
+        // Lane 0 is gathered, lane 1 is masked off and keeps the `src` value;
+        // the upper two lanes are expected to be zero.
+        let expected = _mm_setr_ps(0.0, 256.0, 0.0, 0.0);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_ps() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let offsets = _mm256_setr_epi64x(0, 16, 32, 48);
+        // A scale of 4 (the byte size of an `f32`) turns the offsets into
+        // element indices.
+        let r = _mm256_i64gather_ps(arr.as_ptr(), offsets, 4);
+        // Four 64-bit offsets produce four 32-bit results, i.e. a 128-bit
+        // vector.
+        let expected = _mm_setr_ps(0.0, 16.0, 32.0, 48.0);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_ps() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f32; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f32;
+        }
+        let src = _mm_set1_ps(256.0);
+        let offsets = _mm256_setr_epi64x(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256.0 fill
+        // value taken from `src`.
+        let mask = _mm_setr_ps(-1.0, -1.0, -1.0, 0.0);
+        // A scale of 4 (the byte size of an `f32`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i64gather_ps(src, arr.as_ptr(), offsets, mask, 4);
+        let expected = _mm_setr_ps(0.0, 16.0, 64.0, 256.0);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i64gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let offsets = _mm_setr_epi64x(0, 16);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm_i64gather_epi64(arr.as_ptr(), offsets, 8);
+        let expected = _mm_setr_epi64x(0, 16);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let src = _mm_set1_epi64x(256);
+        let offsets = _mm_setr_epi64x(16, 16);
+        // Only the first lane is enabled; the second lane is expected to keep
+        // the 256 fill value taken from `src`.
+        let mask = _mm_setr_epi64x(-1, 0);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm_mask_i64gather_epi64(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm_setr_epi64x(16, 256);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let offsets = _mm256_setr_epi64x(0, 16, 32, 48);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm256_i64gather_epi64(arr.as_ptr(), offsets, 8);
+        let expected = _mm256_setr_epi64x(0, 16, 32, 48);
+        assert_eq_m256i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_epi64() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0i64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as i64;
+        }
+        let src = _mm256_set1_epi64x(256);
+        let offsets = _mm256_setr_epi64x(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256 fill
+        // value taken from `src`.
+        let mask = _mm256_setr_epi64x(-1, -1, -1, 0);
+        // A scale of 8 (the byte size of an `i64`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i64gather_epi64(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm256_setr_epi64x(0, 16, 64, 256);
+        assert_eq_m256i(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_i64gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let offsets = _mm_setr_epi64x(0, 16);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm_i64gather_pd(arr.as_ptr(), offsets, 8);
+        let expected = _mm_setr_pd(0.0, 16.0);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let src = _mm_set1_pd(256.0);
+        let offsets = _mm_setr_epi64x(16, 16);
+        // Only the first lane is enabled; the second lane is expected to keep
+        // the 256.0 fill value taken from `src`.
+        let mask = _mm_setr_pd(-1.0, 0.0);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm_mask_i64gather_pd(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm_setr_pd(16.0, 256.0);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let offsets = _mm256_setr_epi64x(0, 16, 32, 48);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm256_i64gather_pd(arr.as_ptr(), offsets, 8);
+        let expected = _mm256_setr_pd(0.0, 16.0, 32.0, 48.0);
+        assert_eq_m256d(r, expected);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_pd() {
+        // Fill the table so that every element holds its own index.
+        let mut arr = [0.0f64; 128];
+        for (idx, slot) in arr.iter_mut().enumerate() {
+            *slot = idx as f64;
+        }
+        let src = _mm256_set1_pd(256.0);
+        let offsets = _mm256_setr_epi64x(0, 16, 64, 96);
+        // The last lane is disabled and is expected to keep the 256.0 fill
+        // value taken from `src`.
+        let mask = _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0);
+        // A scale of 8 (the byte size of an `f64`) turns the offsets into
+        // element indices.
+        let r = _mm256_mask_i64gather_pd(src, arr.as_ptr(), offsets, mask, 8);
+        let expected = _mm256_setr_pd(0.0, 16.0, 64.0, 256.0);
+        assert_eq_m256d(r, expected);
+    }
+
+    // `_mm256_extract_epi8` is an AVX2 intrinsic, so the test must only run
+    // when AVX2 is available (enabling just `avx` could execute it on a CPU
+    // that lacks the feature).
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_extract_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi8(
+            -1, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31
+        );
+        let r1 = _mm256_extract_epi8(a, 0);
+        // 35 is past the 32 lanes; the expected value of 3 pins down that
+        // the index wraps around (35 % 32 == 3).
+        let r2 = _mm256_extract_epi8(a, 35);
+        assert_eq!(r1, -1);
+        assert_eq!(r2, 3);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_extract_epi16() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi16(
+            -1, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        let first = _mm256_extract_epi16(a, 0);
+        // 19 is past the 16 lanes; the expected value of 3 pins down that
+        // the index wraps around (19 % 16 == 3).
+        let wrapped = _mm256_extract_epi16(a, 19);
+        assert_eq!(first, -1);
+        assert_eq!(wrapped, 3);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_extract_epi32() {
+        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
+        let first = _mm256_extract_epi32(a, 0);
+        // 11 is past the 8 lanes; the expected value of 3 pins down that
+        // the index wraps around (11 % 8 == 3).
+        let wrapped = _mm256_extract_epi32(a, 11);
+        assert_eq!(first, -1);
+        assert_eq!(wrapped, 3);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtsd_f64() {
+        // The first element passed to `setr` is the one expected back.
+        let lowest = _mm256_cvtsd_f64(_mm256_setr_pd(1., 2., 3., 4.));
+        assert_eq!(lowest, 1.);
+    }
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_cvtsi256_si32() {
+        // The first element passed to `setr` is the one expected back.
+        let lowest = _mm256_cvtsi256_si32(_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
+        assert_eq!(lowest, 1);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
new file mode 100644
index 00000000000..94efadac748
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -0,0 +1,193 @@
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use mem::{self, MaybeUninit};
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Computes the absolute values of packed 32-bit integers in `a`.
+///
+/// Every lane is compared against zero: lanes greater than zero are kept
+/// as they are, all other lanes are replaced by `0 - lane`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i {
+    let lanes = a.as_i32x16();
+    // An all-zero bit pattern is a properly initialized `i32x16`.
+    let zeros: i32x16 = MaybeUninit::zeroed().into_inner();
+    let is_positive: i32x16 = simd_gt(lanes, zeros);
+    let negated = simd_sub(zeros, lanes);
+    mem::transmute(simd_select(is_positive, lanes, negated))
+}
+
+/// Computes the absolute value of packed 32-bit integers in `a`, and stores
+/// the unsigned results in `dst` using writemask `k` (elements are copied
+/// from `src` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_mask_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    // Lanes whose mask bit is clear fall back to the matching lane of `src`.
+    let fallback = src.as_i32x16();
+    let magnitudes = _mm512_abs_epi32(a).as_i32x16();
+    mem::transmute(simd_select_bitmask(k, magnitudes, fallback))
+}
+
+/// Computes the absolute value of packed 32-bit integers in `a`, and stores
+/// the unsigned results in `dst` using zeromask `k` (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33,34,35,35&text=_mm512_maskz_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
+    // Lanes whose mask bit is clear are taken from an all-zero vector.
+    let zeros = _mm512_setzero_si512().as_i32x16();
+    let magnitudes = _mm512_abs_epi32(a).as_i32x16();
+    mem::transmute(simd_select_bitmask(k, magnitudes, zeros))
+}
+
+/// Returns a vector of type `__m512i` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_si512() -> __m512i {
+    // An all-zero bit pattern is a properly initialized `__m512i`, so the
+    // zeroed storage can be read back directly.
+    MaybeUninit::zeroed().into_inner()
+}
+
+/// Sets packed 32-bit integers in `dst` with the supplied values in reverse
+/// order.
+///
+/// The arguments are stored in the order they are written: `e15` becomes
+/// the first element of the vector and `e0` the last.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_epi32(
+    e15: i32,
+    e14: i32,
+    e13: i32,
+    e12: i32,
+    e11: i32,
+    e10: i32,
+    e9: i32,
+    e8: i32,
+    e7: i32,
+    e6: i32,
+    e5: i32,
+    e4: i32,
+    e3: i32,
+    e2: i32,
+    e1: i32,
+    e0: i32,
+) -> __m512i {
+    #[rustfmt::skip]
+    let lanes = i32x16(
+        e15, e14, e13, e12,
+        e11, e10, e9, e8,
+        e7, e6, e5, e4,
+        e3, e2, e1, e0,
+    );
+    mem::transmute(lanes)
+}
+
+#[cfg(test)]
+mod tests {
+    use std;
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_abs_epi32() {
+        #[rustfmt::skip]
+        let a = _mm512_setr_epi32(
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+        );
+        // `i32::MIN` has no positive counterpart, so it is expected to come
+        // back unchanged (`MAX.wrapping_add(1)` is `MIN`).
+        #[rustfmt::skip]
+        let expected = _mm512_setr_epi32(
+            0, 1, 1, std::i32::MAX,
+            std::i32::MAX.wrapping_add(1), 100, 100, 32,
+            0, 1, 1, std::i32::MAX,
+            std::i32::MAX.wrapping_add(1), 100, 100, 32,
+        );
+        assert_eq_m512i(_mm512_abs_epi32(a), expected);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_abs_epi32() {
+        #[rustfmt::skip]
+        let a = _mm512_setr_epi32(
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+        );
+
+        // With an empty mask every lane is copied from `src` (here `a`).
+        let untouched = _mm512_mask_abs_epi32(a, 0, a);
+        assert_eq_m512i(untouched, a);
+
+        // Only the low eight mask bits are set: the first eight lanes get
+        // the absolute value, the last eight are copied from `src`.
+        let half = _mm512_mask_abs_epi32(a, 0b11111111, a);
+        #[rustfmt::skip]
+        let expected = _mm512_setr_epi32(
+            0, 1, 1, std::i32::MAX,
+            std::i32::MAX.wrapping_add(1), 100, 100, 32,
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+        );
+        assert_eq_m512i(half, expected);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_abs_epi32() {
+        #[rustfmt::skip]
+        let a = _mm512_setr_epi32(
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+            0, 1, -1, std::i32::MAX,
+            std::i32::MIN, 100, -100, -32,
+        );
+
+        // With an empty mask every lane is zeroed.
+        let cleared = _mm512_maskz_abs_epi32(0, a);
+        assert_eq_m512i(cleared, _mm512_setzero_si512());
+
+        // Only the low eight mask bits are set: the first eight lanes get
+        // the absolute value, the last eight are zeroed.
+        let half = _mm512_maskz_abs_epi32(0b11111111, a);
+        #[rustfmt::skip]
+        let expected = _mm512_setr_epi32(
+            0, 1, 1, std::i32::MAX,
+            std::i32::MAX.wrapping_add(1), 100, 100, 32,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+        );
+        assert_eq_m512i(half, expected);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bmi1.rs b/library/stdarch/crates/core_arch/src/x86/bmi1.rs
new file mode 100644
index 00000000000..ece4d76dc3d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bmi1.rs
@@ -0,0 +1,178 @@
+//! Bit Manipulation Instruction (BMI) Set 1.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Extracts bits in range [`start`, `start` + `length`) from `a` into
+/// the least significant bits of the result.
+///
+/// Only the low 8 bits of `start` and of `len` are used: they are packed
+/// into a control word (`start` in bits `[7,0]`, `len` in bits `[15,8]`)
+/// that is handed to [`_bextr2_u32`](fn._bextr2_u32.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+    let first_bit = start & 0xff_u32;
+    let bit_count = (len & 0xff_u32) << 8_u32;
+    _bextr2_u32(a, first_bit | bit_count)
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range
+/// to be extracted, and bits `[15,8]` specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
+    // Forwards directly to the `llvm.x86.bmi.bextr.32` intrinsic declared
+    // in this module.
+    x86_bmi_bextr_32(a, control)
+}
+
+/// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// Returns `!a & b`: every bit that is set in `b` but clear in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
+    // Plain integer ops; the test-only `assert_instr(andn)` attribute is
+    // expected to check that this lowers to a single `andn`.
+    !a & b
+}
+
+/// Extract lowest set isolated bit.
+///
+/// Returns a value with only the least significant set bit of `x` kept
+/// (computed as `x & -x` with wrapping negation); the result is `0` when
+/// `x` is `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsi_u32(x: u32) -> u32 {
+    // `wrapping_neg` is the two's-complement negation; and-ing it with `x`
+    // leaves only the lowest set bit.
+    x & x.wrapping_neg()
+}
+
+/// Get mask up to lowest set bit.
+///
+/// Returns a value in which the lowest set bit of `x` and every bit below
+/// it are set (computed as `x ^ (x - 1)` with wrapping subtraction). For
+/// `x == 0` this yields all ones.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
+    // `x - 1` flips the lowest set bit and all bits below it; the xor keeps
+    // exactly those flipped positions.
+    x ^ (x.wrapping_sub(1_u32))
+}
+
+/// Resets the lowest set bit of `x`.
+///
+/// If `x` is `0`, the underlying `blsr` instruction sets CF; the returned
+/// value is `0` in that case.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsr_u32(x: u32) -> u32 {
+    // `x - 1` flips the lowest set bit and all bits below it; and-ing with
+    // `x` therefore clears just that bit.
+    x & (x.wrapping_sub(1))
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is 0, it returns its size in bits (32).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
+    // `u32::trailing_zeros` already returns 32 for an input of 0.
+    x.trailing_zeros()
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is 0, it returns its size in bits (32).
+///
+/// Same computation as [`_tzcnt_u32`](fn._tzcnt_u32.html), but the count is
+/// returned as an `i32`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 {
+    // The count is at most 32, so the cast to `i32` cannot truncate.
+    x.trailing_zeros() as i32
+}
+
+// LLVM intrinsic backing `_bextr2_u32`; `x` is the source value and `y` the
+// control word.
+extern "C" {
+    #[link_name = "llvm.x86.bmi.bextr.32"]
+    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_bextr_u32() {
+        let r = _bextr_u32(0b0101_0000u32, 4, 4);
+        assert_eq!(r, 0b0000_0101u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_bextr2_u32() {
+        // Control word: start = 4 in bits [7,0], length = 4 in bits [15,8].
+        let r = _bextr2_u32(0b0101_0000u32, 0x0404u32);
+        assert_eq!(r, 0b0000_0101u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_andn_u32() {
+        assert_eq!(_andn_u32(0, 0), 0);
+        assert_eq!(_andn_u32(0, 1), 1);
+        assert_eq!(_andn_u32(1, 0), 0);
+        assert_eq!(_andn_u32(1, 1), 0);
+
+        let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32);
+        assert_eq!(r, 0b1111_1111u32);
+
+        let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32);
+        assert_eq!(r, 0b0001_1101u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsi_u32() {
+        assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsmsk_u32() {
+        let r = _blsmsk_u32(0b0011_0000u32);
+        assert_eq!(r, 0b0001_1111u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsr_u32() {
+        let r = _blsr_u32(0b0011_0000u32);
+        assert_eq!(r, 0b0010_0000u32);
+        // With no bit set there is nothing to reset: the result stays 0.
+        assert_eq!(_blsr_u32(0u32), 0u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_tzcnt_u32() {
+        assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32);
+        assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32);
+        assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_mm_tzcnt_32() {
+        assert_eq!(_mm_tzcnt_32(0b0000_0001u32), 0i32);
+        assert_eq!(_mm_tzcnt_32(0b0000_0000u32), 32i32);
+        assert_eq!(_mm_tzcnt_32(0b1001_0000u32), 4i32);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bmi2.rs b/library/stdarch/crates/core_arch/src/x86/bmi2.rs
new file mode 100644
index 00000000000..ab8cab3138b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bmi2.rs
@@ -0,0 +1,133 @@
+//! Bit Manipulation Instruction (BMI) Set 2.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Unsigned multiply without affecting flags.
+///
+/// Unsigned multiplication of `a` with `b`: the low 32 bits of the 64-bit
+/// product are returned and the high 32 bits are written to `hi`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u32)
+#[inline]
+// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
+#[target_feature(enable = "bmi2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 {
+    // Widen both operands so the full 64-bit product is available; two
+    // `u32` values can never overflow a `u64` when multiplied.
+    let result: u64 = (a as u64) * (b as u64);
+    // Upper half goes to the out-parameter, lower half is the return value.
+    *hi = (result >> 32) as u32;
+    result as u32
+}
+
+/// Zero higher bits of `a` >= `index`.
+///
+/// Bits of `a` below position `index` are kept, e.g. `0b1111_0010` with an
+/// `index` of 5 gives `0b0001_0010`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
+    // Forwards directly to the LLVM intrinsic declared in this module.
+    x86_bmi2_bzhi_32(a, index)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pdep))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
+    // Forwards directly to the LLVM intrinsic declared in this module.
+    x86_bmi2_pdep_32(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pext))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
+    // Forwards directly to the LLVM intrinsic declared in this module.
+    x86_bmi2_pext_32(a, mask)
+}
+
+// LLVM intrinsics backing `_bzhi_u32`, `_pdep_u32` and `_pext_u32`. In each
+// declaration `x` is the value operand and `y` the index/mask operand.
+extern "C" {
+    #[link_name = "llvm.x86.bmi.bzhi.32"]
+    fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32;
+    #[link_name = "llvm.x86.bmi.pdep.32"]
+    fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32;
+    #[link_name = "llvm.x86.bmi.pext.32"]
+    fn x86_bmi2_pext_32(x: u32, y: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_pext_u32() {
+        let source = 0b1011_1110_1001_0011u32;
+
+        // (mask, expected extraction) pairs for the same source value.
+        let cases = [
+            (0b0110_0011_1000_0101u32, 0b0000_0000_0011_0101u32),
+            (0b1110_1011_1110_1111u32, 0b0001_0111_0100_0011u32),
+        ];
+
+        for &(mask, expected) in cases.iter() {
+            assert_eq!(_pext_u32(source, mask), expected);
+        }
+    }
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_pdep_u32() {
+        let source = 0b1011_1110_1001_0011u32;
+
+        // (mask, expected deposit) pairs for the same source value.
+        let cases = [
+            (0b0110_0011_1000_0101u32, 0b0000_0010_0000_0101u32),
+            (0b1110_1011_1110_1111u32, 0b1110_1001_0010_0011u32),
+        ];
+
+        for &(mask, expected) in cases.iter() {
+            assert_eq!(_pdep_u32(source, mask), expected);
+        }
+    }
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_bzhi_u32() {
+        // Bits 5 and above are cleared, the low five bits survive.
+        let input = 0b1111_0010u32;
+        let expected = 0b0001_0010u32;
+        assert_eq!(_bzhi_u32(input, 5), expected);
+    }
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_mulx_u32() {
+        let a: u32 = 4_294_967_200;
+        let b: u32 = 2;
+        let mut hi = 0;
+        let lo = _mulx_u32(a, b, &mut hi);
+        // The full product is 8589934400, i.e.
+        //   0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
+        // whose top nibble is `hi` and whose remaining 32 bits are `lo`.
+        assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32);
+        assert_eq!(hi, 0b0001u32);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bswap.rs b/library/stdarch/crates/core_arch/src/x86/bswap.rs
new file mode 100644
index 00000000000..ee6d6615b14
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bswap.rs
@@ -0,0 +1,35 @@
+//! Byte swap intrinsics.
+
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Returns an integer with the reversed byte order of `x`.
+///
+/// For example `0x0EADBE0F` becomes `0x0FBEAD0E`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap)
+#[inline]
+#[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bswap(x: i32) -> i32 {
+    // Forwards directly to the `llvm.bswap.i32` intrinsic declared in this
+    // module.
+    bswap_i32(x)
+}
+
+// LLVM intrinsic backing `_bswap`.
+// NOTE(review): `i32` is an FFI-safe type, so the `improper_ctypes` allow
+// looks unnecessary here - confirm before removing it.
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.bswap.i32"]
+    fn bswap_i32(x: i32) -> i32;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bswap() {
+        // The four bytes 0E AD BE 0F come back in the opposite order.
+        let swapped = unsafe { _bswap(0x0EADBE0F) };
+        assert_eq!(swapped, 0x0FBEAD0E);
+
+        // Zero is unaffected by the byte swap.
+        let zero = unsafe { _bswap(0x00000000) };
+        assert_eq!(zero, 0x00000000);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/cpuid.rs b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
new file mode 100644
index 00000000000..adf3e127d64
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
@@ -0,0 +1,187 @@
+//! `cpuid` intrinsics
+
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Result of the `cpuid` instruction.
+#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub struct CpuidResult {
+    /// EAX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub eax: u32,
+    /// EBX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub ebx: u32,
+    /// ECX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub ecx: u32,
+    /// EDX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub edx: u32,
+}
+
+/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`)
+/// and
+/// `sub_leaf` (`ECX`).
+///
+/// The highest-supported leaf value is returned by the first tuple element of
+/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containing
+/// sub-leaves, the second tuple element holds the highest-supported
+/// sub-leaf
+/// value.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] describes which information can be
+/// queried using the `EAX` and `ECX` registers, and the interpretation of
+/// the results returned in `EAX`, `EBX`, `ECX`, and `EDX`.
+///
+/// The references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+///   Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+///   System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+#[inline]
+#[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
+    let eax;
+    let ebx;
+    let ecx;
+    let edx;
+    #[cfg(target_arch = "x86")]
+    {
+        asm!("cpuid"
+             : "={eax}"(eax), "={ebx}"(ebx), "={ecx}"(ecx), "={edx}"(edx)
+             : "{eax}"(leaf), "{ecx}"(sub_leaf)
+             : :);
+    }
+    #[cfg(target_arch = "x86_64")]
+    {
+        // x86-64 uses %rbx as the base register, so preserve it.
+        asm!("cpuid\n"
+             : "={eax}"(eax), "={ebx}"(ebx), "={ecx}"(ecx), "={edx}"(edx)
+             : "{eax}"(leaf), "{ecx}"(sub_leaf)
+             : "rbx" :);
+    }
+    CpuidResult { eax, ebx, ecx, edx }
+}
+
+/// See [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline]
+#[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __cpuid(leaf: u32) -> CpuidResult {
+    __cpuid_count(leaf, 0)
+}
+
+/// Returns `true` if the host supports the `cpuid` instruction.
+#[inline]
+pub fn has_cpuid() -> bool {
+    #[cfg(target_env = "sgx")]
+    {
+        false
+    }
+    #[cfg(all(not(target_env = "sgx"), target_arch = "x86_64"))]
+    {
+        true
+    }
+    #[cfg(all(not(target_env = "sgx"), target_arch = "x86"))]
+    {
+        // Optimization for i586 and i686 Rust targets which have SSE enabled
+        // and support cpuid:
+        #[cfg(target_feature = "sse")]
+        {
+            true
+        }
+
+        // If SSE is not enabled, detect whether cpuid is available:
+        #[cfg(not(target_feature = "sse"))]
+        unsafe {
+            // On `x86` the `cpuid` instruction is not always available.
+            // This follows the approach indicated in:
+            // http://wiki.osdev.org/CPUID#Checking_CPUID_availability
+            // https://software.intel.com/en-us/articles/using-cpuid-to-detect-the-presence-of-sse-41-and-sse-42-instruction-sets/
+            // which detects whether `cpuid` is available by checking whether
+            // bit 21 (mask 0x200000) of the EFLAGS register is modifiable.
+            // If it is, then `cpuid` is available.
+            let result: u32;
+            let _temp: u32;
+            asm!(r#"
+                 # Read eflags into $0 and copy it into $1:
+                 pushfd
+                 pop     $0
+                 mov     $1, $0
+                 # Flip 21st bit of $0.
+                 xor     $0, 0x200000
+                 # Set eflags to the value of $0
+                 #
+                 # Bit 21st can only be modified if cpuid is available
+                 push    $0
+                 popfd          # A
+                 # Read eflags into $0:
+                 pushfd         # B
+                 pop     $0
+                 # xor with the original eflags sets the bits that
+                 # have been modified:
+                 xor     $0, $1
+                 "#
+                 : "=r"(result), "=r"(_temp)
+                 :
+                 : "cc", "memory"
+                 : "intel");
+            // There is a race between popfd (A) and pushfd (B)
+            // where bits other than bit 21 may have been modified due to
+            // interrupts, a debugger stepping through the asm, etc.
+            //
+            // Therefore, explicitly check whether bit 21
+            // was modified or not.
+            //
+            // If the masked result is zero, the cpuid bit was not modified.
+            // If the masked result is 0x200000 (non-zero), then the cpuid bit
+            // was successfully modified and the CPU supports the cpuid
+            // instruction:
+            (result & 0x200000) != 0
+        }
+    }
+}
+
+/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid`
+/// values.
+///
+/// If `cpuid` is supported, and `leaf` is zero, then the first tuple element
+/// contains the highest `leaf` value that `cpuid` supports. For `leaf`s
+/// containing sub-leaves, the second tuple element contains the
+/// highest-supported sub-leaf value.
+///
+/// See also [`__cpuid`](fn.__cpuid.html) and
+/// [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) {
+    let CpuidResult { eax, ebx, .. } = __cpuid(leaf);
+    (eax, ebx)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+
+    #[test]
+    fn test_always_has_cpuid() {
+        // all currently-tested targets have the instruction
+        // FIXME: add targets without `cpuid` to CI
+        assert!(cpuid::has_cpuid());
+    }
+
+    #[test]
+    fn test_has_cpuid_idempotent() {
+        assert_eq!(cpuid::has_cpuid(), cpuid::has_cpuid());
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/eflags.rs b/library/stdarch/crates/core_arch/src/x86/eflags.rs
new file mode 100644
index 00000000000..68c0b6ee840
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/eflags.rs
@@ -0,0 +1,83 @@
+//! `i386` intrinsics
+
+/// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(
+    since = "1.29.0",
+    reason = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __readeflags() -> u32 {
+    let eflags: u32;
+    asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile");
+    eflags
+}
+
+/// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(
+    since = "1.29.0",
+    reason = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __readeflags() -> u64 {
+    let eflags: u64;
+    asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile");
+    eflags
+}
+
+/// Writes EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(
+    since = "1.29.0",
+    reason = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __writeeflags(eflags: u32) {
+    asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile");
+}
+
+/// Writes EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(
+    since = "1.29.0",
+    reason = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __writeeflags(eflags: u64) {
+    asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile");
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+
+    #[test]
+    #[allow(deprecated)]
+    fn test_eflags() {
+        unsafe {
+            // Reads EFLAGS, writes the value back, reads it again,
+            // and checks that both reads are equal:
+            let v = __readeflags();
+            __writeeflags(v);
+            let u = __readeflags();
+            assert_eq!(v, u);
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/fma.rs b/library/stdarch/crates/core_arch/src/x86/fma.rs
new file mode 100644
index 00000000000..4915c44f574
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/fma.rs
@@ -0,0 +1,802 @@
+//! Fused Multiply-Add instruction set (FMA)
+//!
+//! The FMA instruction set is an extension to the 128 and 256-bit SSE
+//! instructions in the x86 microprocessor instruction set to perform fused
+//! multiply–add (FMA) operations.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
+//! instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use core_arch::x86::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and add the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmaddpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and add the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfmaddpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and add the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmaddps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and add the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfmaddps256(a, b, c)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and add the intermediate result to the lower element in `c`.
+/// Store the result in the lower element of the returned value, and copy the
+/// upper element from `a` to the upper element of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmaddsd(a, b, c)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and add the intermediate result to the lower element in `c`.
+/// Store the result in the lower element of the returned value, and copy the
+/// 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmaddss(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternatively add and subtract packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmaddsubpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternatively add and subtract packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfmaddsubpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternatively add and subtract packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmaddsubps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternatively add and subtract packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfmaddsubps256(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmsubpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfmsubpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub213ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmsubps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub213ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfmsubps256(a, b, c)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and subtract the lower element in `c` from the intermediate
+/// result. Store the result in the lower element of the returned value, and
+/// copy the upper element from `a` to the upper element of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmsubsd(a, b, c)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and subtract the lower element in `c` from the intermediate
+/// result. Store the result in the lower element of the returned value, and
+/// copy the 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmsubss(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternatively subtract and add packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfmsubaddpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternatively subtract and add packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfmsubaddpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternatively subtract and add packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfmsubaddps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternatively subtract and add packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfmsubaddps256(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and add the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfnmaddpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and add the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfnmaddpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and add the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfnmaddps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and add the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfnmaddps256(a, b, c)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and add the negated intermediate result to the lower element
+/// in `c`. Store the result in the lower element of the returned value, and
+/// copy the upper element from `a` to the upper element of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfnmaddsd(a, b, c)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and add the negated intermediate result to the lower element
+/// in `c`. Store the result in the lower element of the returned value, and
+/// copy the 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfnmaddss(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfnmsubpd(a, b, c)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    vfnmsubpd256(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfnmsubps(a, b, c)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtract packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    vfnmsubps256(a, b, c)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and subtract the lower element in `c` from the negated
+/// intermediate result. Store the result in the lower element of the returned
+/// value, and copy the upper element from `a` to the upper element of the
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    vfnmsubsd(a, b, c)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and subtract the lower element in `c` from the negated
+/// intermediate result. Store the result in the lower element of the
+/// returned value, and copy the 3 upper elements from `a` to the upper
+/// elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+    vfnmsubss(a, b, c)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.fma.vfmadd.pd"]
+    fn vfmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmadd.pd.256"]
+    fn vfmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfmadd.ps"]
+    fn vfmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmadd.ps.256"]
+    fn vfmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfmadd.sd"]
+    fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmadd.ss"]
+    fn vfmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmaddsub.pd"]
+    fn vfmaddsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmaddsub.pd.256"]
+    fn vfmaddsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfmaddsub.ps"]
+    fn vfmaddsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmaddsub.ps.256"]
+    fn vfmaddsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfmsub.pd"]
+    fn vfmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmsub.pd.256"]
+    fn vfmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfmsub.ps"]
+    fn vfmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmsub.ps.256"]
+    fn vfmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfmsub.sd"]
+    fn vfmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmsub.ss"]
+    fn vfmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmsubadd.pd"]
+    fn vfmsubaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfmsubadd.pd.256"]
+    fn vfmsubaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfmsubadd.ps"]
+    fn vfmsubaddps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfmsubadd.ps.256"]
+    fn vfmsubaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfnmadd.pd"]
+    fn vfnmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfnmadd.pd.256"]
+    fn vfnmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfnmadd.ps"]
+    fn vfnmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfnmadd.ps.256"]
+    fn vfnmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfnmadd.sd"]
+    fn vfnmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfnmadd.ss"]
+    fn vfnmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfnmsub.pd"]
+    fn vfnmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfnmsub.pd.256"]
+    fn vfnmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+    #[link_name = "llvm.x86.fma.vfnmsub.ps"]
+    fn vfnmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+    #[link_name = "llvm.x86.fma.vfnmsub.ps.256"]
+    fn vfnmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+    #[link_name = "llvm.x86.fma.vfnmsub.sd"]
+    fn vfnmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.fma.vfnmsub.ss"]
+    fn vfnmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
+}
+
+#[cfg(test)]
+mod tests {
+    use std;
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmadd_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(9., 15.);
+        assert_eq_m128d(_mm_fmadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmadd_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(9., 15., 22., 15.);
+        assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmadd_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(9., 15., 22., 15.);
+        assert_eq_m128(_mm_fmadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmadd_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
+        assert_eq_m256(_mm256_fmadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmadd_sd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(9., 2.);
+        assert_eq_m128d(_mm_fmadd_sd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmadd_ss() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(9., 2., 3., 4.);
+        assert_eq_m128(_mm_fmadd_ss(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmaddsub_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(1., 15.);
+        assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmaddsub_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(1., 15., 20., 15.);
+        assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmaddsub_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(1., 15., 20., 15.);
+        assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmaddsub_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
+        assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsub_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(1., -3.);
+        assert_eq_m128d(_mm_fmsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmsub_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(1., -3., 20., 1.);
+        assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsub_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(1., -3., 20., 1.);
+        assert_eq_m128(_mm_fmsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmsub_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
+        assert_eq_m256(_mm256_fmsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsub_sd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(1., 2.);
+        assert_eq_m128d(_mm_fmsub_sd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsub_ss() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(1., 2., 3., 4.);
+        assert_eq_m128(_mm_fmsub_ss(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsubadd_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(9., -3.);
+        assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmsubadd_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(9., -3., 22., 1.);
+        assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fmsubadd_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(9., -3., 22., 1.);
+        assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fmsubadd_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
+        assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmadd_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(-1., 3.);
+        assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fnmadd_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(-1., 3., -20., -1.);
+        assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmadd_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(-1., 3., -20., -1.);
+        assert_eq_m128(_mm_fnmadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fnmadd_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
+        assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmadd_sd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(-1., 2.);
+        assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmadd_ss() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(-1., 2., 3., 4.);
+        assert_eq_m128(_mm_fnmadd_ss(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmsub_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(-9., -15.);
+        assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fnmsub_pd() {
+        let a = _mm256_setr_pd(1., 2., 3., 4.);
+        let b = _mm256_setr_pd(5., 3., 7., 2.);
+        let c = _mm256_setr_pd(4., 9., 1., 7.);
+        let r = _mm256_setr_pd(-9., -15., -22., -15.);
+        assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmsub_ps() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(-9., -15., -22., -15.);
+        assert_eq_m128(_mm_fnmsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm256_fnmsub_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+        let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
+        assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmsub_sd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(5., 3.);
+        let c = _mm_setr_pd(4., 9.);
+        let r = _mm_setr_pd(-9., 2.);
+        assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r);
+    }
+
+    #[simd_test(enable = "fma")]
+    unsafe fn test_mm_fnmsub_ss() {
+        let a = _mm_setr_ps(1., 2., 3., 4.);
+        let b = _mm_setr_ps(5., 3., 7., 2.);
+        let c = _mm_setr_ps(4., 9., 1., 7.);
+        let r = _mm_setr_ps(-9., 2., 3., 4.);
+        assert_eq_m128(_mm_fnmsub_ss(a, b, c), r);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/fxsr.rs b/library/stdarch/crates/core_arch/src/x86/fxsr.rs
new file mode 100644
index 00000000000..df511972dbd
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/fxsr.rs
@@ -0,0 +1,112 @@
+//! FXSR floating-point context fast save and restore.
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.fxsave"]
+    fn fxsave(p: *mut u8) -> ();
+    #[link_name = "llvm.x86.fxrstor"]
+    fn fxrstor(p: *const u8) -> ();
+}
+
+/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// A misaligned destination operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxsave(mem_addr: *mut u8) {
+    fxsave(mem_addr)
+}
+
+/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// The contents of this memory region should have been written to by a
+/// previous
+/// `_fxsave` or `_fxsave64` intrinsic.
+///
+/// A misaligned source operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxrstor(mem_addr: *const u8) {
+    fxrstor(mem_addr)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+    use std::{cmp::PartialEq, fmt};
+    use stdsimd_test::simd_test;
+
+    #[repr(align(16))]
+    struct FxsaveArea {
+        data: [u8; 512], // 512 bytes
+    }
+
+    impl FxsaveArea {
+        fn new() -> FxsaveArea {
+            FxsaveArea { data: [0; 512] }
+        }
+        fn ptr(&mut self) -> *mut u8 {
+            &mut self.data[0] as *mut _ as *mut u8
+        }
+    }
+
+    impl PartialEq<FxsaveArea> for FxsaveArea {
+        fn eq(&self, other: &FxsaveArea) -> bool {
+            for i in 0..self.data.len() {
+                if self.data[i] != other.data[i] {
+                    return false;
+                }
+            }
+            true
+        }
+    }
+
+    impl fmt::Debug for FxsaveArea {
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "[")?;
+            for i in 0..self.data.len() {
+                write!(f, "{}", self.data[i])?;
+                if i != self.data.len() - 1 {
+                    write!(f, ", ")?;
+                }
+            }
+            write!(f, "]")
+        }
+    }
+
+    #[simd_test(enable = "fxsr")]
+    unsafe fn fxsave() {
+        let mut a = FxsaveArea::new();
+        let mut b = FxsaveArea::new();
+
+        fxsr::_fxsave(a.ptr());
+        fxsr::_fxrstor(a.ptr());
+        fxsr::_fxsave(b.ptr());
+        assert_eq!(a, b);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/macros.rs b/library/stdarch/crates/core_arch/src/x86/macros.rs
new file mode 100644
index 00000000000..b8c283f1f47
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/macros.rs
@@ -0,0 +1,109 @@
+//! Utility macros.
+
+macro_rules! constify_imm6 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b1_1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            15 => $expand!(15),
+            16 => $expand!(16),
+            17 => $expand!(17),
+            18 => $expand!(18),
+            19 => $expand!(19),
+            20 => $expand!(20),
+            21 => $expand!(21),
+            22 => $expand!(22),
+            23 => $expand!(23),
+            24 => $expand!(24),
+            25 => $expand!(25),
+            26 => $expand!(26),
+            27 => $expand!(27),
+            28 => $expand!(28),
+            29 => $expand!(29),
+            30 => $expand!(30),
+            _ => $expand!(31),
+        }
+    };
+}
+
+macro_rules! constify_imm4 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            _ => $expand!(15),
+        }
+    };
+}
+
+macro_rules! constify_imm3 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            _ => $expand!(7),
+        }
+    };
+}
+
+macro_rules! constify_imm2 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b11 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            _ => $expand!(3),
+        }
+    };
+}
+
+#[cfg(test)]
+macro_rules! assert_approx_eq {
+    ($a:expr, $b:expr, $eps:expr) => {{
+        let (a, b) = (&$a, &$b);
+        assert!(
+            (*a - *b).abs() < $eps,
+            "assertion failed: `(left !== right)` \
+             (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)",
+            *a,
+            *b,
+            $eps,
+            (*a - *b).abs()
+        );
+    }};
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/mmx.rs b/library/stdarch/crates/core_arch/src/x86/mmx.rs
new file mode 100644
index 00000000000..82f085cf95f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/mmx.rs
@@ -0,0 +1,794 @@
+//! `i586` MMX instruction set.
+//!
+//! The intrinsics here roughly correspond to those in the `mmintrin.h` C
+//! header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use core_arch::simd::*;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Constructs a 64-bit integer vector initialized to zero.
+#[inline]
+#[target_feature(enable = "mmx")]
+// FIXME: this produces a movl instead of xorps on x86
+// FIXME: this produces a xor intrinsic instead of xorps on x86_64
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(xor))]
+pub unsafe fn _mm_setzero_si64() -> __m64 {
+    mem::transmute(0_i64)
+}
+
+/// Add packed 8-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddb))]
+pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 {
+    paddb(a, b)
+}
+
+/// Add packed 8-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddb))]
+pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 {
+    _mm_add_pi8(a, b)
+}
+
+/// Add packed 16-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddw))]
+pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 {
+    paddw(a, b)
+}
+
+/// Add packed 16-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddw))]
+pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 {
+    _mm_add_pi16(a, b)
+}
+
+/// Add packed 32-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddd))]
+pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 {
+    paddd(a, b)
+}
+
+/// Add packed 32-bit integers in `a` and `b`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddd))]
+pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 {
+    _mm_add_pi32(a, b)
+}
+
+/// Add packed 8-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddsb))]
+pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 {
+    paddsb(a, b)
+}
+
+/// Add packed 8-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddsb))]
+pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 {
+    _mm_adds_pi8(a, b)
+}
+
+/// Add packed 16-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddsw))]
+pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 {
+    paddsw(a, b)
+}
+
+/// Add packed 16-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddsw))]
+pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 {
+    _mm_adds_pi16(a, b)
+}
+
+/// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddusb))]
+pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 {
+    paddusb(a, b)
+}
+
+/// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddusb))]
+pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 {
+    _mm_adds_pu8(a, b)
+}
+
+/// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddusw))]
+pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 {
+    paddusw(a, b)
+}
+
+/// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(paddusw))]
+pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 {
+    _mm_adds_pu16(a, b)
+}
+
+/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubb))]
+pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 {
+    psubb(a, b)
+}
+
+/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubb))]
+pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 {
+    _mm_sub_pi8(a, b)
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubw))]
+pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 {
+    psubw(a, b)
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubw))]
+pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 {
+    _mm_sub_pi16(a, b)
+}
+
+/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubd))]
+pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 {
+    psubd(a, b)
+}
+
+/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubd))]
+pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 {
+    _mm_sub_pi32(a, b)
+}
+
+/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
+/// using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubsb))]
+pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 {
+    psubsb(a, b)
+}
+
+/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
+/// using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubsb))]
+pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 {
+    _mm_subs_pi8(a, b)
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
+/// using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubsw))]
+pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 {
+    psubsw(a, b)
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
+/// using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubsw))]
+pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 {
+    _mm_subs_pi16(a, b)
+}
+
+/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
+/// integers in `a` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubusb))]
+pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 {
+    psubusb(a, b)
+}
+
+/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
+/// integers in `a` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubusb))]
+pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 {
+    _mm_subs_pu8(a, b)
+}
+
+/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned
+/// 16-bit integers in `a` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubusw))]
+pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 {
+    psubusw(a, b)
+}
+
+/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned
+/// 16-bit integers in `a` using saturation.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(psubusw))]
+pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 {
+    _mm_subs_pu16(a, b)
+}
+
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
+/// less than 0x80 are saturated to 0x80.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(packsswb))]
+pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 {
+    packsswb(a, b)
+}
+
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative values
+/// less than 0x8000 are saturated to 0x8000.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(packssdw))]
+pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 {
+    packssdw(a, b)
+}
+
+/// Compares whether each element of `a` is greater than the corresponding
+/// element of `b` returning `0` for `false` and `-1` for `true`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 {
+    pcmpgtb(a, b)
+}
+
+/// Compares whether each element of `a` is greater than the corresponding
+/// element of `b` returning `0` for `false` and `-1` for `true`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 {
+    pcmpgtw(a, b)
+}
+
+/// Compares whether each element of `a` is greater than the corresponding
+/// element of `b` returning `0` for `false` and `-1` for `true`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 {
+    pcmpgtd(a, b)
+}
+
+/// Unpacks the upper two elements from two `i16x4` vectors and interleaves
+/// them into the result: `[a.2, b.2, a.3, b.3]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected
+pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 {
+    punpckhwd(a, b)
+}
+
+/// Unpacks the upper four elements from two `i8x8` vectors and interleaves
+/// them into the result: `[a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpckhbw))]
+pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 {
+    punpckhbw(a, b)
+}
+
+/// Unpacks the lower four elements from two `i8x8` vectors and interleaves
+/// them into the result: `[a.0, b.0, a.1, b.1, a.2, b.2, a.3, b.3]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpcklbw))]
+pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 {
+    punpcklbw(a, b)
+}
+
+/// Unpacks the lower two elements from two `i16x4` vectors and interleaves
+/// them into the result: `[a.0, b.0, a.1, b.1]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpcklwd))]
+pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 {
+    punpcklwd(a, b)
+}
+
+/// Unpacks the upper element from two `i32x2` vectors and interleaves them
+/// into the result: `[a.1, b.1]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpckhdq))]
+pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 {
+    punpckhdq(a, b)
+}
+
+/// Unpacks the lower element from two `i32x2` vectors and interleaves them
+/// into the result: `[a.0, b.0]`.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(punpckldq))]
+pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 {
+    punpckldq(a, b)
+}
+
+/// Set packed 16-bit integers in dst with the supplied values.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set_pi16(e3: i16, e2: i16, e1: i16, e0: i16) -> __m64 {
+    _mm_setr_pi16(e0, e1, e2, e3)
+}
+
+/// Set packed 32-bit integers in dst with the supplied values.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 {
+    _mm_setr_pi32(e0, e1)
+}
+
+/// Set packed 8-bit integers in dst with the supplied values.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set_pi8(e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8) -> __m64 {
+    _mm_setr_pi8(e0, e1, e2, e3, e4, e5, e6, e7)
+}
+
+/// Broadcast 16-bit integer `a` to all elements of the returned vector.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set1_pi16(a: i16) -> __m64 {
+    _mm_setr_pi16(a, a, a, a)
+}
+
+/// Broadcast 32-bit integer `a` to all elements of the returned vector.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set1_pi32(a: i32) -> __m64 {
+    _mm_setr_pi32(a, a)
+}
+
+/// Broadcast 8-bit integer `a` to all elements of the returned vector.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 {
+    _mm_setr_pi8(a, a, a, a, a, a, a, a)
+}
+
+/// Set packed 16-bit integers in dst with the supplied values in reverse
+/// order.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 {
+    mem::transmute(i16x4::new(e0, e1, e2, e3))
+}
+
+/// Set packed 32-bit integers in dst with the supplied values in reverse
+/// order.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 {
+    mem::transmute(i32x2::new(e0, e1))
+}
+
+/// Set packed 8-bit integers in dst with the supplied values in reverse order.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_setr_pi8(
+    e0: i8,
+    e1: i8,
+    e2: i8,
+    e3: i8,
+    e4: i8,
+    e5: i8,
+    e6: i8,
+    e7: i8,
+) -> __m64 {
+    mem::transmute(i8x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+}
+
+/// Empty the MMX state, which marks the x87 FPU registers as available for use
+/// by x87 instructions. This instruction must be used at the end of all MMX
+/// technology procedures.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(emms))]
+pub unsafe fn _mm_empty() {
+    emms()
+}
+
+/// Empty the MMX state, which marks the x87 FPU registers as available for use
+/// by x87 instructions. This instruction must be used at the end of all MMX
+/// technology procedures.
+#[inline]
+#[target_feature(enable = "mmx")]
+#[cfg_attr(test, assert_instr(emms))]
+pub unsafe fn _m_empty() {
+    emms()
+}
+
+/// Copy 32-bit integer `a` to the lower element of the return value, and zero
+/// the upper element of the return value.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_cvtsi32_si64(a: i32) -> __m64 {
+    mem::transmute(i32x2::new(a, 0))
+}
+
+/// Return the lower 32-bit integer in `a`.
+#[inline]
+#[target_feature(enable = "mmx")]
+pub unsafe fn _mm_cvtsi64_si32(a: __m64) -> i32 {
+    let r: i32x2 = mem::transmute(a);
+    r.0
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.mmx.padd.b"]
+    fn paddb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.padd.w"]
+    fn paddw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.padd.d"]
+    fn paddd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.padds.b"]
+    fn paddsb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.padds.w"]
+    fn paddsw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.paddus.b"]
+    fn paddusb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.paddus.w"]
+    fn paddusw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psub.b"]
+    fn psubb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psub.w"]
+    fn psubw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psub.d"]
+    fn psubd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psubs.b"]
+    fn psubsb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psubs.w"]
+    fn psubsw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psubus.b"]
+    fn psubusb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psubus.w"]
+    fn psubusw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.packsswb"]
+    fn packsswb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.packssdw"]
+    fn packssdw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pcmpgt.b"]
+    fn pcmpgtb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pcmpgt.w"]
+    fn pcmpgtw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pcmpgt.d"]
+    fn pcmpgtd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpckhwd"]
+    fn punpckhwd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpcklwd"]
+    fn punpcklwd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpckhbw"]
+    fn punpckhbw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpcklbw"]
+    fn punpcklbw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpckhdq"]
+    fn punpckhdq(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpckldq"]
+    fn punpckldq(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.emms"]
+    fn emms();
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_setzero_si64() {
+        let r: __m64 = ::std::mem::transmute(0_i64);
+        assert_eq_m64(r, _mm_setzero_si64());
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_add_pi8() {
+        let a = _mm_setr_pi8(-1, -1, 1, 1, -1, 0, 1, 0);
+        let b = _mm_setr_pi8(-127, 101, 99, 126, 0, -1, 0, 1);
+        let e = _mm_setr_pi8(-128, 100, 100, 127, -1, -1, 1, 1);
+        assert_eq_m64(e, _mm_add_pi8(a, b));
+        assert_eq_m64(e, _m_paddb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_add_pi16() {
+        let a = _mm_setr_pi16(-1, -1, 1, 1);
+        let b = _mm_setr_pi16(i16::min_value() + 1, 30001, -30001, i16::max_value() - 1);
+        let e = _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value());
+        assert_eq_m64(e, _mm_add_pi16(a, b));
+        assert_eq_m64(e, _m_paddw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_add_pi32() {
+        let a = _mm_setr_pi32(1, -1);
+        let b = _mm_setr_pi32(i32::max_value() - 1, i32::min_value() + 1);
+        let e = _mm_setr_pi32(i32::max_value(), i32::min_value());
+        assert_eq_m64(e, _mm_add_pi32(a, b));
+        assert_eq_m64(e, _m_paddd(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_adds_pi8() {
+        let a = _mm_setr_pi8(-100, -1, 1, 100, -1, 0, 1, 0);
+        let b = _mm_setr_pi8(-100, 1, -1, 100, 0, -1, 0, 1);
+        let e = _mm_setr_pi8(i8::min_value(), 0, 0, i8::max_value(), -1, -1, 1, 1);
+        assert_eq_m64(e, _mm_adds_pi8(a, b));
+        assert_eq_m64(e, _m_paddsb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_adds_pi16() {
+        let a = _mm_setr_pi16(-32000, 32000, 4, 0);
+        let b = _mm_setr_pi16(-32000, 32000, -5, 1);
+        let e = _mm_setr_pi16(i16::min_value(), i16::max_value(), -1, 1);
+        assert_eq_m64(e, _mm_adds_pi16(a, b));
+        assert_eq_m64(e, _m_paddsw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_adds_pu8() {
+        let a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 200u8 as i8);
+        let b = _mm_setr_pi8(0, 10, 20, 30, 40, 50, 60, 200u8 as i8);
+        let e = _mm_setr_pi8(0, 11, 22, 33, 44, 55, 66, u8::max_value() as i8);
+        assert_eq_m64(e, _mm_adds_pu8(a, b));
+        assert_eq_m64(e, _m_paddusb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_adds_pu16() {
+        let a = _mm_setr_pi16(0, 1, 2, 60000u16 as i16);
+        let b = _mm_setr_pi16(0, 10, 20, 60000u16 as i16);
+        let e = _mm_setr_pi16(0, 11, 22, u16::max_value() as i16);
+        assert_eq_m64(e, _mm_adds_pu16(a, b));
+        assert_eq_m64(e, _m_paddusw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_sub_pi8() {
+        let a = _mm_setr_pi8(0, 0, 1, 1, -1, -1, 0, 0);
+        let b = _mm_setr_pi8(-1, 1, -2, 2, 100, -100, -127, 127);
+        let e = _mm_setr_pi8(1, -1, 3, -1, -101, 99, 127, -127);
+        assert_eq_m64(e, _mm_sub_pi8(a, b));
+        assert_eq_m64(e, _m_psubb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_sub_pi16() {
+        let a = _mm_setr_pi16(-20000, -20000, 20000, 30000);
+        let b = _mm_setr_pi16(-10000, 10000, -10000, 30000);
+        let e = _mm_setr_pi16(-10000, -30000, 30000, 0);
+        assert_eq_m64(e, _mm_sub_pi16(a, b));
+        assert_eq_m64(e, _m_psubw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_sub_pi32() {
+        let a = _mm_setr_pi32(500_000, -500_000);
+        let b = _mm_setr_pi32(500_000, 500_000);
+        let e = _mm_setr_pi32(0, -1_000_000);
+        assert_eq_m64(e, _mm_sub_pi32(a, b));
+        assert_eq_m64(e, _m_psubd(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_subs_pi8() {
+        let a = _mm_setr_pi8(-100, 100, 0, 0, 0, 0, -5, 5);
+        let b = _mm_setr_pi8(100, -100, i8::min_value(), 127, -1, 1, 3, -3);
+        let e = _mm_setr_pi8(
+            i8::min_value(),
+            i8::max_value(),
+            i8::max_value(),
+            -127,
+            1,
+            -1,
+            -8,
+            8,
+        );
+        assert_eq_m64(e, _mm_subs_pi8(a, b));
+        assert_eq_m64(e, _m_psubsb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_subs_pi16() {
+        let a = _mm_setr_pi16(-20000, 20000, 0, 0);
+        let b = _mm_setr_pi16(20000, -20000, -1, 1);
+        let e = _mm_setr_pi16(i16::min_value(), i16::max_value(), 1, -1);
+        assert_eq_m64(e, _mm_subs_pi16(a, b));
+        assert_eq_m64(e, _m_psubsw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_subs_pu8() {
+        let a = _mm_setr_pi8(50, 10, 20, 30, 40, 60, 70, 80);
+        let b = _mm_setr_pi8(60, 20, 30, 40, 30, 20, 10, 0);
+        let e = _mm_setr_pi8(0, 0, 0, 0, 10, 40, 60, 80);
+        assert_eq_m64(e, _mm_subs_pu8(a, b));
+        assert_eq_m64(e, _m_psubusb(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_subs_pu16() {
+        let a = _mm_setr_pi16(10000, 200, 0, 44444u16 as i16);
+        let b = _mm_setr_pi16(20000, 300, 1, 11111);
+        let e = _mm_setr_pi16(0, 0, 0, 33333u16 as i16);
+        assert_eq_m64(e, _mm_subs_pu16(a, b));
+        assert_eq_m64(e, _m_psubusw(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_packs_pi16() {
+        let a = _mm_setr_pi16(-1, 2, -3, 4);
+        let b = _mm_setr_pi16(-5, 6, -7, 8);
+        let r = _mm_setr_pi8(-1, 2, -3, 4, -5, 6, -7, 8);
+        assert_eq_m64(r, _mm_packs_pi16(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_packs_pi32() {
+        let a = _mm_setr_pi32(-1, 2);
+        let b = _mm_setr_pi32(-5, 6);
+        let r = _mm_setr_pi16(-1, 2, -5, 6);
+        assert_eq_m64(r, _mm_packs_pi32(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_cmpgt_pi8() {
+        let a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
+        let b = _mm_setr_pi8(8, 7, 6, 5, 4, 3, 2, 1);
+        let r = _mm_setr_pi8(0, 0, 0, 0, 0, -1, -1, -1);
+        assert_eq_m64(r, _mm_cmpgt_pi8(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_cmpgt_pi16() {
+        let a = _mm_setr_pi16(0, 1, 2, 3);
+        let b = _mm_setr_pi16(4, 3, 2, 1);
+        let r = _mm_setr_pi16(0, 0, 0, -1);
+        assert_eq_m64(r, _mm_cmpgt_pi16(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_cmpgt_pi32() {
+        let a = _mm_setr_pi32(0, 3);
+        let b = _mm_setr_pi32(1, 2);
+        let r0 = _mm_setr_pi32(0, -1);
+        let r1 = _mm_setr_pi32(-1, 0);
+
+        assert_eq_m64(r0, _mm_cmpgt_pi32(a, b));
+        assert_eq_m64(r1, _mm_cmpgt_pi32(b, a));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpackhi_pi8() {
+        let a = _mm_setr_pi8(0, 3, 4, 7, 8, 11, 12, 15);
+        let b = _mm_setr_pi8(1, 2, 5, 6, 9, 10, 13, 14);
+        let r = _mm_setr_pi8(8, 9, 11, 10, 12, 13, 15, 14);
+
+        assert_eq_m64(r, _mm_unpackhi_pi8(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpacklo_pi8() {
+        let a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
+        let b = _mm_setr_pi8(8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm_setr_pi8(0, 8, 1, 9, 2, 10, 3, 11);
+        assert_eq_m64(r, _mm_unpacklo_pi8(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpackhi_pi16() {
+        let a = _mm_setr_pi16(0, 1, 2, 3);
+        let b = _mm_setr_pi16(4, 5, 6, 7);
+        let r = _mm_setr_pi16(2, 6, 3, 7);
+        assert_eq_m64(r, _mm_unpackhi_pi16(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpacklo_pi16() {
+        let a = _mm_setr_pi16(0, 1, 2, 3);
+        let b = _mm_setr_pi16(4, 5, 6, 7);
+        let r = _mm_setr_pi16(0, 4, 1, 5);
+        assert_eq_m64(r, _mm_unpacklo_pi16(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpackhi_pi32() {
+        let a = _mm_setr_pi32(0, 3);
+        let b = _mm_setr_pi32(1, 2);
+        let r = _mm_setr_pi32(3, 2);
+
+        assert_eq_m64(r, _mm_unpackhi_pi32(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_unpacklo_pi32() {
+        let a = _mm_setr_pi32(0, 3);
+        let b = _mm_setr_pi32(1, 2);
+        let r = _mm_setr_pi32(0, 1);
+
+        assert_eq_m64(r, _mm_unpacklo_pi32(a, b));
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_empty() {
+        _mm_empty();
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_m_empty() {
+        _m_empty();
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_cvtsi32_si64() {
+        let a = _mm_cvtsi32_si64(42);
+        let b = _mm_setr_pi32(42, 0);
+        assert_eq_m64(a, b);
+    }
+
+    #[simd_test(enable = "mmx")]
+    unsafe fn test_mm_cvtsi64_si32() {
+        let a = _mm_setr_pi32(42, 666);
+        let b = _mm_cvtsi64_si32(a);
+        assert_eq!(b, 42);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
new file mode 100644
index 00000000000..ee7c5219fe8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -0,0 +1,617 @@
+//! `x86` and `x86_64` intrinsics.
+
+use mem;
+use prelude::v1::*;
+
+#[macro_use]
+mod macros;
+
+types! {
+    /// 64-bit wide integer vector type, x86-specific
+    ///
+    /// This type is the same as the `__m64` type defined by Intel,
+    /// representing a 64-bit SIMD register. Usage of this type typically
+    /// corresponds to the `mmx` target feature.
+    ///
+    /// Internally this type may be viewed as:
+    ///
+    /// * `i8x8` - eight `i8` variables packed together
+    /// * `i16x4` - four `i16` variables packed together
+    /// * `i32x2` - two `i32` variables packed together
+    ///
+    /// (as well as unsigned versions). Each intrinsic may interpret the
+    /// internal bits differently, check the documentation of the intrinsic
+    /// to see how it's being used.
+    ///
+    /// Note that this means that an instance of `__m64` typically just means
+    /// a "bag of bits" which is left up to interpretation at the point of use.
+    ///
+    /// Most intrinsics using `__m64` are prefixed with `_mm_` and the
+    /// integer types tend to correspond to suffixes like "pi8" or "pi32" (not
+    /// to be confused with "epiXX", used for `__m128i`).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(stdsimd, mmx_target_feature)]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "mmx")]
+    /// # unsafe fn foo() {
+    /// let all_bytes_zero = _mm_setzero_si64();
+    /// let all_bytes_one = _mm_set1_pi8(1);
+    /// let two_i32 = _mm_set_pi32(1, 2);
+    /// # }
+    /// # if is_x86_feature_detected!("mmx") { unsafe { foo() } }
+    /// # }
+    /// ```
+    pub struct __m64(i64);
+
+    /// 128-bit wide integer vector type, x86-specific
+    ///
+    /// This type is the same as the `__m128i` type defined by Intel,
+    /// representing a 128-bit SIMD register. Usage of this type typically
+    /// corresponds to the `sse` and up target features for x86/x86_64.
+    ///
+    /// Internally this type may be viewed as:
+    ///
+    /// * `i8x16` - sixteen `i8` variables packed together
+    /// * `i16x8` - eight `i16` variables packed together
+    /// * `i32x4` - four `i32` variables packed together
+    /// * `i64x2` - two `i64` variables packed together
+    ///
+    /// (as well as unsigned versions). Each intrinsic may interpret the
+    /// internal bits differently, check the documentation of the intrinsic
+    /// to see how it's being used.
+    ///
+    /// Note that this means that an instance of `__m128i` typically just means
+    /// a "bag of bits" which is left up to interpretation at the point of use.
+    ///
+    /// Most intrinsics using `__m128i` are prefixed with `_mm_` and the
+    /// integer types tend to correspond to suffixes like "epi8" or "epi32".
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "sse2")]
+    /// # unsafe fn foo() {
+    /// let all_bytes_zero = _mm_setzero_si128();
+    /// let all_bytes_one = _mm_set1_epi8(1);
+    /// let four_i32 = _mm_set_epi32(1, 2, 3, 4);
+    /// # }
+    /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m128i(i64, i64);
+
+    /// 128-bit wide set of four `f32` types, x86-specific
+    ///
+    /// This type is the same as the `__m128` type defined by Intel,
+    /// representing a 128-bit SIMD register which internally consists of
+    /// four packed `f32` instances. Usage of this type typically corresponds
+    /// to the `sse` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m128i`, the integer version of the 128-bit
+    /// registers, this `__m128` type has *one* interpretation. Each instance
+    /// of `__m128` always corresponds to `f32x4`, or four `f32` types packed
+    /// together.
+    ///
+    /// Most intrinsics using `__m128` are prefixed with `_mm_` and are
+    /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+    /// "pd" which is used for `__m128d`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "sse")]
+    /// # unsafe fn foo() {
+    /// let four_zeros = _mm_setzero_ps();
+    /// let four_ones = _mm_set1_ps(1.0);
+    /// let four_floats = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+    /// # }
+    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m128(f32, f32, f32, f32);
+
+    /// 128-bit wide set of two `f64` types, x86-specific
+    ///
+    /// This type is the same as the `__m128d` type defined by Intel,
+    /// representing a 128-bit SIMD register which internally consists of
+    /// two packed `f64` instances. Usage of this type typically corresponds
+    /// to the `sse` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m128i`, the integer version of the 128-bit
+    /// registers, this `__m128d` type has *one* interpretation. Each instance
+    /// of `__m128d` always corresponds to `f64x2`, or two `f64` types packed
+    /// together.
+    ///
+    /// Most intrinsics using `__m128d` are prefixed with `_mm_` and are
+    /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+    /// "ps" which is used for `__m128`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "sse")]
+    /// # unsafe fn foo() {
+    /// let two_zeros = _mm_setzero_pd();
+    /// let two_ones = _mm_set1_pd(1.0);
+    /// let two_floats = _mm_set_pd(1.0, 2.0);
+    /// # }
+    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m128d(f64, f64);
+
+    /// 256-bit wide integer vector type, x86-specific
+    ///
+    /// This type is the same as the `__m256i` type defined by Intel,
+    /// representing a 256-bit SIMD register. Usage of this type typically
+    /// corresponds to the `avx` and up target features for x86/x86_64.
+    ///
+    /// Internally this type may be viewed as:
+    ///
+    /// * `i8x32` - thirty two `i8` variables packed together
+    /// * `i16x16` - sixteen `i16` variables packed together
+    /// * `i32x8` - eight `i32` variables packed together
+    /// * `i64x4` - four `i64` variables packed together
+    ///
+    /// (as well as unsigned versions). Each intrinsic may interpret the
+    /// internal bits differently, check the documentation of the intrinsic
+    /// to see how it's being used.
+    ///
+    /// Note that this means that an instance of `__m256i` typically just means
+    /// a "bag of bits" which is left up to interpretation at the point of use.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "avx")]
+    /// # unsafe fn foo() {
+    /// let all_bytes_zero = _mm256_setzero_si256();
+    /// let all_bytes_one = _mm256_set1_epi8(1);
+    /// let eight_i32 = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    /// # }
+    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m256i(i64, i64, i64, i64);
+
+    /// 256-bit wide set of eight `f32` types, x86-specific
+    ///
+    /// This type is the same as the `__m256` type defined by Intel,
+    /// representing a 256-bit SIMD register which internally consists of
+    /// eight packed `f32` instances. Usage of this type typically corresponds
+    /// to the `avx` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m256i`, the integer version of the 256-bit
+    /// registers, this `__m256` type has *one* interpretation. Each instance
+    /// of `__m256` always corresponds to `f32x8`, or eight `f32` types packed
+    /// together.
+    ///
+    /// Most intrinsics using `__m256` are prefixed with `_mm256_` and are
+    /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+    /// "pd" which is used for `__m256d`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "avx")]
+    /// # unsafe fn foo() {
+    /// let eight_zeros = _mm256_setzero_ps();
+    /// let eight_ones = _mm256_set1_ps(1.0);
+    /// let eight_floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+    /// # }
+    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m256(f32, f32, f32, f32, f32, f32, f32, f32);
+
+    /// 256-bit wide set of four `f64` types, x86-specific
+    ///
+    /// This type is the same as the `__m256d` type defined by Intel,
+    /// representing a 256-bit SIMD register which internally consists of
+    /// four packed `f64` instances. Usage of this type typically corresponds
+    /// to the `avx` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m256i`, the integer version of the 256-bit
+    /// registers, this `__m256d` type has *one* interpretation. Each instance
+    /// of `__m256d` always corresponds to `f64x4`, or four `f64` types packed
+    /// together.
+    ///
+    /// Most intrinsics using `__m256d` are prefixed with `_mm256_` and are
+    /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+    /// "ps" which is used for `__m256`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![cfg_attr(not(dox), feature(stdsimd))]
+    /// # #![cfg_attr(not(dox), no_std)]
+    /// # #[cfg(not(dox))]
+    /// # extern crate std as real_std;
+    /// # #[cfg(not(dox))]
+    /// # extern crate core_arch as std;
+    /// # #[macro_use(is_x86_feature_detected)]
+    /// # extern crate std_detect;
+    /// #[cfg(target_arch = "x86")]
+    /// use std::arch::x86::*;
+    /// #[cfg(target_arch = "x86_64")]
+    /// use std::arch::x86_64::*;
+    ///
+    /// # fn main() {
+    /// # #[target_feature(enable = "avx")]
+    /// # unsafe fn foo() {
+    /// let four_zeros = _mm256_setzero_pd();
+    /// let four_ones = _mm256_set1_pd(1.0);
+    /// let four_floats = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+    /// # }
+    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+    /// # }
+    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
+    pub struct __m256d(f64, f64, f64, f64);
+
+    /// 512-bit wide integer vector type, x86-specific
+    ///
+    /// This type is the same as the `__m512i` type defined by Intel,
+    /// representing a 512-bit SIMD register. Usage of this type typically
+    /// corresponds to the `avx512*` and up target features for x86/x86_64.
+    ///
+    /// Internally this type may be viewed as:
+    ///
+    /// * `i8x64` - sixty-four `i8` variables packed together
+    /// * `i16x32` - thirty-two `i16` variables packed together
+    /// * `i32x16` - sixteen `i32` variables packed together
+    /// * `i64x8` - eight `i64` variables packed together
+    ///
+    /// (as well as unsigned versions). Each intrinsic may interpret the
+    /// internal bits differently, check the documentation of the intrinsic
+    /// to see how it's being used.
+    ///
+    /// Note that this means that an instance of `__m512i` typically just means
+    /// a "bag of bits" which is left up to interpretation at the point of use.
+    pub struct __m512i(i64, i64, i64, i64, i64, i64, i64, i64);
+
+    /// 512-bit wide set of sixteen `f32` types, x86-specific
+    ///
+    /// This type is the same as the `__m512` type defined by Intel,
+    /// representing a 512-bit SIMD register which internally consists of
+    /// sixteen packed `f32` instances. Usage of this type typically corresponds
+    /// to the `avx512*` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m512i`, the integer version of the 512-bit
+    /// registers, this `__m512` type has *one* interpretation. Each instance
+    /// of `__m512` always corresponds to `f32x16`, or sixteen `f32` types
+    /// packed together.
+    ///
+    /// Most intrinsics using `__m512` are prefixed with `_mm512_` and are
+    /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+    /// "pd" which is used for `__m512d`.
+    pub struct __m512(
+        f32, f32, f32, f32, f32, f32, f32, f32,
+        f32, f32, f32, f32, f32, f32, f32, f32,
+    );
+
+    /// 512-bit wide set of eight `f64` types, x86-specific
+    ///
+    /// This type is the same as the `__m512d` type defined by Intel,
+    /// representing a 512-bit SIMD register which internally consists of
+    /// eight packed `f64` instances. Usage of this type typically corresponds
+    /// to the `avx512*` and up target features for x86/x86_64.
+    ///
+    /// Note that unlike `__m512i`, the integer version of the 512-bit
+    /// registers, this `__m512d` type has *one* interpretation. Each instance
+    /// of `__m512d` always corresponds to `f64x8`, or eight `f64` types packed
+    /// together.
+    ///
+    /// Most intrinsics using `__m512d` are prefixed with `_mm512_` and are
+    /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+    /// "ps" which is used for `__m512`.
+    pub struct __m512d(f64, f64, f64, f64, f64, f64, f64, f64);
+}
+
+/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer
+#[allow(non_camel_case_types)]
+pub type __mmask16 = i16;
+
+#[cfg(test)]
+mod test;
+#[cfg(test)]
+pub use self::test::*;
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "0")]
+pub(crate) trait m128iExt: Sized {
+    fn as_m128i(self) -> __m128i;
+
+    #[inline]
+    fn as_u8x16(self) -> ::core_arch::simd::u8x16 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_u16x8(self) -> ::core_arch::simd::u16x8 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_u32x4(self) -> ::core_arch::simd::u32x4 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_u64x2(self) -> ::core_arch::simd::u64x2 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_i8x16(self) -> ::core_arch::simd::i8x16 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_i16x8(self) -> ::core_arch::simd::i16x8 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_i32x4(self) -> ::core_arch::simd::i32x4 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+
+    #[inline]
+    fn as_i64x2(self) -> ::core_arch::simd::i64x2 {
+        unsafe { mem::transmute(self.as_m128i()) }
+    }
+}
+
+impl m128iExt for __m128i {
+    #[inline]
+    fn as_m128i(self) -> Self {
+        self
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "0")]
+pub(crate) trait m256iExt: Sized {
+    fn as_m256i(self) -> __m256i;
+
+    #[inline]
+    fn as_u8x32(self) -> ::core_arch::simd::u8x32 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_u16x16(self) -> ::core_arch::simd::u16x16 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_u32x8(self) -> ::core_arch::simd::u32x8 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_u64x4(self) -> ::core_arch::simd::u64x4 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_i8x32(self) -> ::core_arch::simd::i8x32 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_i16x16(self) -> ::core_arch::simd::i16x16 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_i32x8(self) -> ::core_arch::simd::i32x8 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+
+    #[inline]
+    fn as_i64x4(self) -> ::core_arch::simd::i64x4 {
+        unsafe { mem::transmute(self.as_m256i()) }
+    }
+}
+
+impl m256iExt for __m256i {
+    #[inline]
+    fn as_m256i(self) -> Self {
+        self
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "0")]
+pub(crate) trait m512iExt: Sized {
+    fn as_m512i(self) -> __m512i;
+
+    #[inline]
+    fn as_i32x16(self) -> ::core_arch::simd::i32x16 {
+        unsafe { mem::transmute(self.as_m512i()) }
+    }
+}
+
+impl m512iExt for __m512i {
+    #[inline]
+    fn as_m512i(self) -> Self {
+        self
+    }
+}
+
+mod eflags;
+pub use self::eflags::*;
+
+mod fxsr;
+pub use self::fxsr::*;
+
+mod bswap;
+pub use self::bswap::*;
+
+mod rdtsc;
+pub use self::rdtsc::*;
+
+mod cpuid;
+pub use self::cpuid::*;
+mod xsave;
+pub use self::xsave::*;
+
+mod sse;
+pub use self::sse::*;
+mod sse2;
+pub use self::sse2::*;
+mod sse3;
+pub use self::sse3::*;
+mod ssse3;
+pub use self::ssse3::*;
+mod sse41;
+pub use self::sse41::*;
+mod sse42;
+pub use self::sse42::*;
+mod avx;
+pub use self::avx::*;
+mod avx2;
+pub use self::avx2::*;
+mod fma;
+pub use self::fma::*;
+
+mod abm;
+pub use self::abm::*;
+mod bmi1;
+pub use self::bmi1::*;
+
+mod bmi2;
+pub use self::bmi2::*;
+
+#[cfg(not(stdsimd_intel_sde))]
+mod sse4a;
+#[cfg(not(stdsimd_intel_sde))]
+pub use self::sse4a::*;
+
+#[cfg(not(stdsimd_intel_sde))]
+mod tbm;
+#[cfg(not(stdsimd_intel_sde))]
+pub use self::tbm::*;
+
+mod mmx;
+pub use self::mmx::*;
+
+mod pclmulqdq;
+pub use self::pclmulqdq::*;
+
+mod aes;
+pub use self::aes::*;
+
+mod rdrand;
+pub use self::rdrand::*;
+
+mod sha;
+pub use self::sha::*;
+
+mod adx;
+pub use self::adx::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Generates the trap instruction `UD2`
+#[cfg_attr(test, assert_instr(ud2))]
+#[inline]
+pub unsafe fn ud2() -> ! {
+    ::intrinsics::abort()
+}
+
+mod avx512f;
+pub use self::avx512f::*;
diff --git a/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs
new file mode 100644
index 00000000000..c047b3b1cd6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs
@@ -0,0 +1,74 @@
+//! Carry-less Multiplication (CLMUL)
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
+//!
+//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use core_arch::x86::__m128i;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.pclmulqdq"]
+    fn pclmulqdq(a: __m128i, b: __m128i, imm8: u8) -> __m128i;
+}
+
+/// Perform a carry-less multiplication of two 64-bit polynomials over the
+/// finite field GF(2^k).
+///
+/// The immediate byte is used for determining which halves of `a` and `b`
+/// should be used. Immediate bits other than 0 and 4 are ignored.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
+#[inline]
+#[target_feature(enable = "pclmulqdq")]
+#[cfg_attr(all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_clmulepi64_si128(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            pclmulqdq(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+#[cfg(test)]
+mod tests {
+    // The constants in the tests below are just bit patterns. They should not
+    // be interpreted as integers; signedness does not make sense for them, but
+    // __m128i happens to be defined in terms of signed integers.
+    #![allow(overflowing_literals)]
+
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "pclmulqdq")]
+    unsafe fn test_mm_clmulepi64_si128() {
+        // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
+        let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
+        let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
+        let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
+        let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
+        let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
+        let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
+
+        assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x00), r00);
+        assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x10), r01);
+        assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x01), r10);
+        assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x11), r11);
+
+        let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
+        let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
+        assert_eq_m128i(_mm_clmulepi64_si128(a0, a0, 0x00), r);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/rdrand.rs b/library/stdarch/crates/core_arch/src/x86/rdrand.rs
new file mode 100644
index 00000000000..63573f689d6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/rdrand.rs
@@ -0,0 +1,76 @@
+//! RDRAND and RDSEED instructions for returning random numbers from an Intel
+//! on-chip hardware random number generator which has been seeded by an
+//! on-chip entropy source.
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+    #[link_name = "llvm.x86.rdrand.16"]
+    fn x86_rdrand16_step() -> (u16, i32);
+    #[link_name = "llvm.x86.rdrand.32"]
+    fn x86_rdrand32_step() -> (u32, i32);
+    #[link_name = "llvm.x86.rdseed.16"]
+    fn x86_rdseed16_step() -> (u16, i32);
+    #[link_name = "llvm.x86.rdseed.32"]
+    fn x86_rdseed32_step() -> (u32, i32);
+}
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Read a hardware generated 16-bit random value and store the result in val.
+/// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand16_step)
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
+    let (v, flag) = x86_rdrand16_step();
+    *val = v;
+    flag
+}
+
+/// Read a hardware generated 32-bit random value and store the result in val.
+/// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand32_step)
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
+    let (v, flag) = x86_rdrand32_step();
+    *val = v;
+    flag
+}
+
+/// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store
+/// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed16_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
+    let (v, flag) = x86_rdseed16_step();
+    *val = v;
+    flag
+}
+
+/// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store
+/// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed32_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 {
+    let (v, flag) = x86_rdseed32_step();
+    *val = v;
+    flag
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/rdtsc.rs b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs
new file mode 100644
index 00000000000..5ed5d496111
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs
@@ -0,0 +1,77 @@
+//! RDTSC instructions.
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Reads the current value of the processor’s time-stamp counter.
+///
+/// The processor monotonically increments the time-stamp counter MSR
+/// every clock cycle and resets it to 0 whenever the processor is
+/// reset.
+///
+/// The RDTSC instruction is not a serializing instruction. It does
+/// not necessarily wait until all previous instructions have been
+/// executed before reading the counter. Similarly, subsequent
+/// instructions may begin execution before the read operation is
+/// performed.
+///
+/// On processors that support the Intel 64 architecture, the
+/// high-order 32 bits of each of RAX and RDX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdtsc)
+#[inline]
+#[cfg_attr(test, assert_instr(rdtsc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdtsc() -> i64 {
+    rdtsc()
+}
+
+/// Reads the current value of the processor’s time-stamp counter and
+/// the `IA32_TSC_AUX MSR`.
+///
+/// The processor monotonically increments the time-stamp counter MSR
+/// every clock cycle and resets it to 0 whenever the processor is
+/// reset.
+///
+/// The RDTSCP instruction waits until all previous instructions have
+/// been executed before reading the counter. However, subsequent
+/// instructions may begin execution before the read operation is
+/// performed.
+///
+/// On processors that support the Intel 64 architecture, the
+/// high-order 32 bits of each of RAX, RDX, and RCX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__rdtscp)
+#[inline]
+#[cfg_attr(test, assert_instr(rdtscp))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __rdtscp(aux: *mut u32) -> u64 {
+    rdtscp(aux as *mut _)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.rdtsc"]
+    fn rdtsc() -> i64;
+    #[link_name = "llvm.x86.rdtscp"]
+    fn rdtscp(aux: *mut u8) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn _rdtsc() {
+        let r = rdtsc::_rdtsc();
+        assert_ne!(r, 0); // The chances of this being 0 are infinitesimal
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn _rdtscp() {
+        let mut aux = 0;
+        let r = rdtsc::__rdtscp(&mut aux);
+        assert_ne!(r, 0); // The chances of this being 0 are infinitesimal
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sha.rs b/library/stdarch/crates/core_arch/src/x86/sha.rs
new file mode 100644
index 00000000000..98bf4707f8c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sha.rs
@@ -0,0 +1,224 @@
+use core_arch::simd::*;
+use core_arch::x86::*;
+use mem;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sha1msg1"]
+    fn sha1msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1msg2"]
+    fn sha1msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1nexte"]
+    fn sha1nexte(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1rnds4"]
+    fn sha1rnds4(a: i32x4, b: i32x4, c: i8) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg1"]
+    fn sha256msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg2"]
+    fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256rnds2"]
+    fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Perform an intermediate calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(sha1msg1(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Perform the final calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using the intermediate result in `a` and the
+/// previous message values in `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(sha1msg2(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Calculate SHA1 state variable E after four rounds of operation from the
+/// current SHA1 state variable `a`, add that value to the scheduled values
+/// (unsigned 32-bit integers) in `b`, and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1nexte_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1nexte))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(sha1nexte(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D)
+/// from `a` and some pre-computed sum of the next 4 round message values
+/// (unsigned 32-bit integers), and state variable E from `b`, and return the
+/// updated SHA1 state (A,B,C,D). `func` contains the logic functions and round
+/// constants.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1rnds4_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1rnds4, func = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1rnds4_epu32(a: __m128i, b: __m128i, func: i32) -> __m128i {
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    macro_rules! call {
+        ($imm2:expr) => {
+            sha1rnds4(a, b, $imm2)
+        };
+    }
+    let ret = constify_imm2!(func, call);
+    mem::transmute(ret)
+}
+
+/// Perform an intermediate calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(sha256msg1(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Perform the final calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(sha256msg2(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Perform 2 rounds of SHA256 operation using an initial SHA256 state
+/// (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a
+/// pre-computed sum of the next 2 round message values (unsigned 32-bit
+/// integers) and the corresponding round constants from `k`, and return the
+/// updated SHA256 state (A,B,E,F).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256rnds2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256rnds2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i {
+    mem::transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4()))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::f32;
+    use std::f64::{self, NAN};
+    use std::i32;
+    use std::mem::{self, transmute};
+
+    use core_arch::simd::*;
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+    use test::black_box; // Used to inhibit constant-folding.
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha1msg1_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0x98829f34f74ad457, 0xda2b1a44d0b5ad3c);
+        let r = _mm_sha1msg1_epu32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha1msg2_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35);
+        let r = _mm_sha1msg2_epu32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha1nexte_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0x2589d5be923f82a4, 0x59f111f13956c25b);
+        let r = _mm_sha1nexte_epu32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha1rnds4_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f);
+        let r = _mm_sha1rnds4_epu32(a, b, 0);
+        assert_eq_m128i(r, expected);
+
+        let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe);
+        let r = _mm_sha1rnds4_epu32(a, b, 1);
+        assert_eq_m128i(r, expected);
+
+        let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001);
+        let r = _mm_sha1rnds4_epu32(a, b, 2);
+        assert_eq_m128i(r, expected);
+
+        let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b);
+        let r = _mm_sha1rnds4_epu32(a, b, 3);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha256msg1_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee);
+        let r = _mm_sha256msg1_epu32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha256msg2_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let expected = _mm_set_epi64x(0xb58777ce887fd851, 0x15d1ec8b73ac8450);
+        let r = _mm_sha256msg2_epu32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sha")]
+    #[allow(overflowing_literals)]
+    unsafe fn test_mm_sha256rnds2_epu32() {
+        let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+        let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+        let k = _mm_set_epi64x(0, 0x12835b01d807aa98);
+        let expected = _mm_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19);
+        let r = _mm_sha256rnds2_epu32(a, b, k);
+        assert_eq_m128i(r, expected);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
new file mode 100644
index 00000000000..8d1c237f539
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -0,0 +1,4161 @@
+//! Streaming SIMD Extensions (SSE)
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use intrinsics;
+use mem;
+use ptr;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Adds the first component of `a` and `b`, the other components are copied
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(addss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
+    addss(a, b)
+}
+
+/// Adds __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(addps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
+    simd_add(a, b)
+}
+
+/// Subtracts the first component of `b` from `a`, the other components are
+/// copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(subss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
+    subss(a, b)
+}
+
+/// Subtracts __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(subps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
+    simd_sub(a, b)
+}
+
+/// Multiplies the first component of `a` and `b`, the other components are
+/// copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
+    mulss(a, b)
+}
+
+/// Multiplies __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
+    simd_mul(a, b)
+}
+
+/// Divides the first component of `a` by the first component of `b`, the
+/// other components are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
+    divss(a, b)
+}
+
+/// Divides __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
+    simd_div(a, b)
+}
+
+/// Return the square root of the first single-precision (32-bit)
+/// floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
+    sqrtss(a)
+}
+
+/// Return the square root of packed single-precision (32-bit) floating-point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
+    sqrtps(a)
+}
+
+/// Return the approximate reciprocal of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rcpss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
+    rcpss(a)
+}
+
+/// Return the approximate reciprocal of packed single-precision (32-bit)
+/// floating-point elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rcpps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
+    rcpps(a)
+}
+
+/// Return the approximate reciprocal square root of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rsqrtss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
+    rsqrtss(a)
+}
+
+/// Return the approximate reciprocal square root of packed single-precision
+/// (32-bit) floating-point elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
+    rsqrtps(a)
+}
+
+/// Compare the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and return the minimum value in the first element of the return
+/// value, the other elements are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(minss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
+    minss(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`, and return the corresponding minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(minps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
+    minps(a, b)
+}
+
+/// Compare the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and return the maximum value in the first element of the return
+/// value, the other elements are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(maxss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
+    maxss(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`, and return the corresponding maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(maxps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
+    maxps(a, b)
+}
+
+/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `and` instructions, so ignore it.
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(andps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(simd_and(a, b))
+}
+
+/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
+/// elements.
+///
+/// Computes `!a & b` for each bit in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `not` and `and` instructions, so ignore
+// it.
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(andnps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    let mask: __m128i = mem::transmute(i32x4::splat(-1));
+    mem::transmute(simd_and(simd_xor(mask, a), b))
+}
+
+/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `or` instructions, so we ignore it.
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(orps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(simd_or(a, b))
+}
+
+/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
+/// elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `xor` instructions, so we ignore it.
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(xorps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(simd_xor(a, b))
+}
+
+/// Compare the lowest `f32` of both inputs for equality. The lowest 32 bits of
+/// the result will be `0xffffffff` if the two inputs are equal, or `0`
+/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpeqss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
+    cmpss(a, b, 0)
+}
+
+/// Compare the lowest `f32` of both inputs for less than. The lowest 32 bits
+/// of the result will be `0xffffffff` if `a.extract(0)` is less than
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
+    cmpss(a, b, 1)
+}
+
+/// Compare the lowest `f32` of both inputs for less than or equal. The lowest
+/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
+/// or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits of the
+/// result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
+    cmpss(a, b, 2)
+}
+
+/// Compare the lowest `f32` of both inputs for greater than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
+/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
+    // `a > b` is evaluated as `b < a`: operands are swapped and the "less
+    // than" predicate (`1`) is reused. The swapped call keeps the upper lanes
+    // of `b`, so only its lowest lane (shuffle index 4) is taken and merged
+    // with lanes 1..=3 of `a`.
+    simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
+}
+
+/// Compare the lowest `f32` of both inputs for greater than or equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
+/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
+/// bits of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
+    // `a >= b` is evaluated as `b <= a` (predicate `2`, operands swapped).
+    // Only the lowest lane of that result (shuffle index 4) is kept; lanes
+    // 1..=3 come from `a`.
+    simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
+}
+
+/// Compare the lowest `f32` of both inputs for inequality. The lowest 32 bits
+/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpneqss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
+    // Immediate `4` is the "not equal" predicate of `cmpss`.
+    cmpss(a, b, 4)
+}
+
+/// Compare the lowest `f32` of both inputs for not-less-than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
+    // Immediate `5` is the "not less than" predicate of `cmpss`.
+    cmpss(a, b, 5)
+}
+
+/// Compare the lowest `f32` of both inputs for not-less-than-or-equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
+/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
+/// of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
+    // Immediate `6` is the "not less than or equal" predicate of `cmpss`.
+    cmpss(a, b, 6)
+}
+
+/// Compare the lowest `f32` of both inputs for not-greater-than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
+/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
+/// the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
+    // "`a` not greater than `b`" is evaluated as "`b` not less than `a`"
+    // (predicate `5`, operands swapped). Only the lowest lane of that result
+    // (shuffle index 4) is kept; lanes 1..=3 come from `a`.
+    simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
+}
+
+/// Compare the lowest `f32` of both inputs for not-greater-than-or-equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
+/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
+/// bits of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
+    // "`a` not greater than or equal to `b`" is evaluated as "`b` not less
+    // than or equal to `a`" (predicate `6`, operands swapped). Only the
+    // lowest lane of that result (shuffle index 4) is kept; lanes 1..=3 come
+    // from `a`.
+    simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
+}
+
+/// Check if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
+/// the result will be `0xffffffff` if neither of `a.extract(0)` or
+/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
+    // Immediate `7` is the "ordered" predicate of `cmpss`.
+    cmpss(a, b, 7)
+}
+
+/// Check if the lowest `f32` of both inputs are unordered. The lowest 32 bits
+/// of the result will be `0xffffffff` if any of `a.extract(0)` or
+/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
+    // Immediate `3` is the "unordered" predicate of `cmpss`.
+    cmpss(a, b, 3)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input elements
+/// were equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpeqps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `0` is the "equal" predicate of `cmpps`.
+    cmpps(a, b, 0)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `1` is the "less than" predicate of `cmpps`.
+    cmpps(a, b, 1)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is less than or equal to the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `2` is the "less than or equal" predicate of `cmpps`.
+    cmpps(a, b, 2)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
+    // `a > b` is evaluated as `b < a`: operands are swapped and the "less
+    // than" predicate (`1`) is reused.
+    cmpps(b, a, 1)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
+    // `a >= b` is evaluated as `b <= a`: operands are swapped and the "less
+    // than or equal" predicate (`2`) is reused.
+    cmpps(b, a, 2)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input elements
+/// are *not* equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpneqps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `4` is the "not equal" predicate of `cmpps`.
+    cmpps(a, b, 4)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is *not* less than the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `5` is the "not less than" predicate of `cmpps`.
+    cmpps(a, b, 5)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is *not* less than or equal to the corresponding element in `b`, or
+/// `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `6` is the "not less than or equal" predicate of `cmpps`.
+    cmpps(a, b, 6)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is *not* greater than the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
+    // "`a` not greater than `b`" is evaluated as "`b` not less than `a`":
+    // operands are swapped and predicate `5` is reused.
+    cmpps(b, a, 5)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is *not* greater than or equal to the corresponding element in `b`,
+/// or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
+    // "`a` not greater than or equal to `b`" is evaluated as "`b` not less
+    // than or equal to `a`": operands are swapped and predicate `6` is reused.
+    cmpps(b, a, 6)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are ordered (i.e., neither of them is a NaN), or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `7` is the "ordered" predicate of `cmpps`.
+    // NOTE(review): operands are passed as `(b, a)`, unlike `_mm_cmpord_ss`.
+    // The documented predicate ("neither is a NaN") is symmetric, so this
+    // looks harmless - confirm before changing.
+    cmpps(b, a, 7)
+}
+
+/// Compare each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are unordered (i.e., at least one of them is a NaN), or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
+    // Immediate `3` is the "unordered" predicate of `cmpps`.
+    // NOTE(review): operands are passed as `(b, a)`, unlike `_mm_cmpunord_ss`.
+    // The documented predicate ("at least one is a NaN") is symmetric, so
+    // this looks harmless - confirm before changing.
+    cmpps(b, a, 3)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise.
+///
+/// See [`_mm_ucomieq_ss`](fn._mm_ucomieq_ss.html) for the variant documented
+/// as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comieq_ss` helper, which
+    // is declared outside this part of the file.
+    comieq_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+///
+/// See [`_mm_ucomilt_ss`](fn._mm_ucomilt_ss.html) for the variant documented
+/// as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comilt_ss` helper, which
+    // is declared outside this part of the file.
+    comilt_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise.
+///
+/// See [`_mm_ucomile_ss`](fn._mm_ucomile_ss.html) for the variant documented
+/// as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comile_ss` helper, which
+    // is declared outside this part of the file.
+    comile_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise.
+///
+/// See [`_mm_ucomigt_ss`](fn._mm_ucomigt_ss.html) for the variant documented
+/// as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comigt_ss` helper, which
+    // is declared outside this part of the file.
+    comigt_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than or equal to the one from `b`, or
+/// `0` otherwise.
+///
+/// See [`_mm_ucomige_ss`](fn._mm_ucomige_ss.html) for the variant documented
+/// as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comige_ss` helper, which
+    // is declared outside this part of the file.
+    comige_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are *not* equal, or `0` otherwise.
+///
+/// See [`_mm_ucomineq_ss`](fn._mm_ucomineq_ss.html) for the variant
+/// documented as not signalling an exception on quiet NaN arguments.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `comineq_ss` helper, which
+    // is declared outside this part of the file.
+    comineq_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise. This instruction will not signal
+/// an exception if either argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomieq_ss` helper, which
+    // is declared outside this part of the file.
+    ucomieq_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+/// This instruction will not signal an exception if either argument is a quiet
+/// NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomilt_ss` helper, which
+    // is declared outside this part of the file.
+    ucomilt_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise. This instruction will not signal an exception if either argument
+/// is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomile_ss` helper, which
+    // is declared outside this part of the file.
+    ucomile_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise. This instruction will not signal an exception if either argument
+/// is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomigt_ss` helper, which
+    // is declared outside this part of the file.
+    ucomigt_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than or equal to the one from `b`, or
+/// `0` otherwise. This instruction will not signal an exception if either
+/// argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomige_ss` helper, which
+    // is declared outside this part of the file.
+    ucomige_ss(a, b)
+}
+
+/// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are *not* equal, or `0` otherwise. This instruction will not
+/// signal an exception if either argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
+    // Thin wrapper: the comparison is done by the `ucomineq_ss` helper, which
+    // is declared outside this part of the file.
+    ucomineq_ss(a, b)
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 32 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the result
+/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
+/// (`std::i32::MIN`) or an invalid operation floating point exception if
+/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
+///
+/// See [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html) for the variant that
+/// always truncates instead of using the current rounding mode.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
+    // Thin wrapper around the `cvtss2si` helper, which is declared outside
+    // this part of the file.
+    cvtss2si(a)
+}
+
+/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
+    // Pure forwarding: both names share one implementation.
+    _mm_cvtss_si32(a)
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 32 bit integer
+/// with truncation.
+///
+/// The result is rounded always using truncation (round towards zero). If the
+/// result cannot be represented as a 32 bit integer the result will be
+/// `0x8000_0000` (`std::i32::MIN`) or an invalid operation floating point
+/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
+    // Thin wrapper around the `cvttss2si` helper, which is declared outside
+    // this part of the file.
+    cvttss2si(a)
+}
+
+/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
+    // Pure forwarding: both names share one implementation.
+    _mm_cvttss_si32(a)
+}
+
+/// Extract the lowest 32 bit float from the input vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32)
+#[inline]
+#[target_feature(enable = "sse")]
+// No point in using assert_instrs. In Unix x86_64 calling convention this is a
+// no-op, and on Windows it's just a `mov`.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
+    // Lane index 0 is the lowest element of the vector.
+    simd_extract(a, 0)
+}
+
+/// Convert a 32 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer
+/// `b`.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
+/// input).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
+    // Thin wrapper around the `cvtsi2ss` helper, which is declared outside
+    // this part of the file.
+    cvtsi2ss(a, b)
+}
+
+/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
+    // Pure forwarding: both names share one implementation.
+    _mm_cvtsi32_ss(a, b)
+}
+
+/// Construct a `__m128` with the lowest element set to `a` and the rest set to
+/// zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
+    // The first field is the lowest element; the other three are zeroed.
+    __m128(a, 0.0, 0.0, 0.0)
+}
+
+/// Construct a `__m128` with all elements set to `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
+    // Broadcast: every one of the four elements receives the same value.
+    __m128(a, a, a, a)
+}
+
+/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
+    // Pure forwarding: both names share one implementation.
+    _mm_set1_ps(a)
+}
+
+/// Construct a `__m128` from four floating point values highest to lowest.
+///
+/// Note that `a` will be the highest 32 bits of the result, and `d` the
+/// lowest. This matches the standard way of writing bit patterns on x86:
+///
+/// ```text
+///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
+///        +---------+---------+---------+---------+
+///        |    a    |    b    |    c    |    d    |   result
+///        +---------+---------+---------+---------+
+/// ```
+///
+/// Alternatively:
+///
+/// ```text
+/// let v = _mm_set_ps(d, c, b, a);
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+    // The tuple fields run from lowest to highest element, so the arguments
+    // (given highest first) are written in reverse order.
+    __m128(d, c, b, a)
+}
+
+/// Construct a `__m128` from four floating point values lowest to highest.
+///
+/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
+/// bits of the result, and `d` the highest. It is the reverse of the argument
+/// order taken by [`_mm_set_ps`](fn._mm_set_ps.html).
+///
+/// ```text
+/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))]
+// On a 32-bit architecture it just copies the operands from the stack.
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+    // Arguments are already in lowest-to-highest order; no reordering needed.
+    __m128(a, b, c, d)
+}
+
+/// Construct a `__m128` with all elements initialized to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_ps() -> __m128 {
+    // All four elements are positive zero (`0.0`).
+    __m128(0.0, 0.0, 0.0, 0.0)
+}
+
+/// Packs four 2-bit element selectors into a single mask value for use with
+/// Intel shuffle and permute intrinsics.
+///
+/// The selectors are laid out from most to least significant: `z` occupies
+/// bits 7..6, `y` bits 5..4, `x` bits 3..2 and `w` bits 1..0. Arguments are
+/// not masked, so each one is expected to be in the range `0..=3`; a larger
+/// value spills into the neighbouring fields.
+#[inline]
+#[allow(non_snake_case)]
+#[unstable(feature = "stdsimd", issue = "27731")]
+pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
+    (w | (x << 2) | (y << 4) | (z << 6)) as i32
+}
+
+/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
+/// `b` using `mask`.
+///
+/// The lower half of result takes values from `a` and the higher half from
+/// `b`. Mask is split to 2 control bits each to index the element from inputs:
+/// bits 1..0 and 3..2 pick the two low result elements out of `a`, bits 5..4
+/// and 7..6 pick the two high result elements out of `b`. Only the low 8 bits
+/// of `mask` are used.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps, mask = 3))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 {
+    // Bits above the low byte are discarded.
+    let mask = (mask & 0xFF) as u8;
+
+    // The macros below decode `mask` two bits at a time. Each level matches on
+    // one 2-bit field and passes a literal index on to the next level, so the
+    // final `simd_shuffle4` call is always written with four literal indices
+    // (presumably because the shuffle needs compile-time constant indices).
+    // Indices 0..=3 refer to elements of `a`, 4..=7 to elements of `b`.
+
+    // Innermost level: all four indices are known, emit the shuffle.
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
+        };
+    }
+    // Bits 7..6 select result element 3 from `b` (indices 4..=7).
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (mask >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 4),
+                0b01 => shuffle_done!($x01, $x23, $x45, 5),
+                0b10 => shuffle_done!($x01, $x23, $x45, 6),
+                _ => shuffle_done!($x01, $x23, $x45, 7),
+            }
+        };
+    }
+    // Bits 5..4 select result element 2 from `b` (indices 4..=7).
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (mask >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 4),
+                0b01 => shuffle_x67!($x01, $x23, 5),
+                0b10 => shuffle_x67!($x01, $x23, 6),
+                _ => shuffle_x67!($x01, $x23, 7),
+            }
+        };
+    }
+    // Bits 3..2 select result element 1 from `a` (indices 0..=3).
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (mask >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    // Outermost level: bits 1..0 select result element 0 from `a`.
+    match mask & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the higher half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
+    // Indices 0..=3 address `a`, 4..=7 address `b`, so the result is
+    // `[a[2], b[2], a[3], b[3]]`.
+    simd_shuffle4(a, b, [2, 6, 3, 7])
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the lower half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
+    // Indices 0..=3 address `a`, 4..=7 address `b`, so the result is
+    // `[a[0], b[0], a[1], b[1]]`.
+    simd_shuffle4(a, b, [0, 4, 1, 5])
+}
+
+/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
+/// lower half of result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
+    // TODO: figure out why this is a different instruction on Windows.
+    // Indices 0..=3 address `a`, 4..=7 address `b`, so the result is
+    // `[b[2], b[3], a[2], a[3]]`.
+    simd_shuffle4(a, b, [6, 7, 2, 3])
+}
+
+/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
+/// higher half of result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
+    // Indices 0..=3 address `a`, 4..=7 address `b`, so the result is
+    // `[a[0], a[1], b[0], b[1]]`.
+    simd_shuffle4(a, b, [0, 1, 4, 5])
+}
+
+/// Return a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 4 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
+    // Thin wrapper around the `movmskps` helper, which is declared outside
+    // this part of the file.
+    movmskps(a)
+}
+
+/// Set the upper two single-precision floating-point values with 64 bits of
+/// data loaded from the address `p`; the lower two values are passed through
+/// from `a`.
+///
+/// `p` is dereferenced, so it must point to readable memory holding the 64
+/// bits to load.
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(
+    all(
+        test,
+        any(
+            target_arch = "x86_64",
+            all(target_arch = "x86", target_feature = "sse2")
+        )
+    ),
+    assert_instr(movhpd)
+)]
+// FIXME: 32-bit codegen without SSE2 generates two `shufps` instead of `movhps`
+#[cfg_attr(
+    all(test, target_arch = "x86", not(target_feature = "sse2")),
+    assert_instr(shufps)
+)]
+// TODO: This function is actually not limited to floats, but that's what
+// matches the C type most closely: (__m128, *const __m64) -> __m128
+pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
+    // Reinterpret the 64-bit location as two `f32` values and read them.
+    let q = p as *const f32x2;
+    let b: f32x2 = *q;
+    // Widen the pair to four elements (`[b0, b1, b0, b1]`) so that it can be
+    // shuffled together with the four-element `a`.
+    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
+    // Indices 0..=3 address `a`, 4..=7 address `bb`: result is
+    // `[a[0], a[1], b0, b1]`.
+    simd_shuffle4(a, bb, [0, 1, 4, 5])
+}
+
+/// Load two floats from `p` into the lower half of a `__m128`. The upper half
+/// is copied from the upper half of `a`.
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
+#[cfg_attr(
+    all(test, target_arch = "x86", target_feature = "sse2"),
+    assert_instr(movlpd)
+)]
+// FIXME: On 32-bit targets without SSE2, it just generates two `movss`...
+#[cfg_attr(
+    all(test, target_arch = "x86", not(target_feature = "sse2")),
+    assert_instr(movss)
+)]
+pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 {
+    // Reinterpret the 64 bits at `p` as two `f32` lanes.
+    let q = p as *const f32x2;
+    let b: f32x2 = *q;
+    // Widen the two loaded lanes to four (`[b0, b1, b0, b1]`) so they can be
+    // combined with `a` in a single 4-lane shuffle.
+    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
+    // Result: `[b0, b1, a2, a3]` (indices 4 and 5 are lanes 0 and 1 of `bb`).
+    simd_shuffle4(a, bb, [4, 5, 2, 3])
+}
+
+/// Construct a `__m128` with the lowest element read from `p` and the other
+/// elements set to zero.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
+    // Only a single `f32` is read from `p`; the other three lanes are zeros.
+    __m128(*p, 0.0, 0.0, 0.0)
+}
+
+/// Construct a `__m128` by duplicating the value read from `p` into all
+/// elements.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
+/// shuffling.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
+    // A single `f32` is read from `p` and broadcast to all four lanes.
+    let a = *p;
+    __m128(a, a, a, a)
+}
+
+/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html): reads one `f32` from `p`
+/// and duplicates it into all elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
+    _mm_load1_ps(p)
+}
+
+/// Load four `f32` values from *aligned* memory into a `__m128`. If the
+/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
+    // The cast raises the pointee type from `f32` to `__m128`; the caller
+    // must guarantee the 16-byte alignment described above, which is why the
+    // clippy alignment lint is silenced for this function.
+    *(p as *const __m128)
+}
+
+/// Load four `f32` values from memory into a `__m128`. There are no
+/// restrictions on memory alignment. For aligned memory
+/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
+    // Note: Using `*p` would require `f32` alignment, but `movups` has no
+    // alignment restrictions.
+    // Instead, copy `size_of::<__m128>()` raw bytes from `p` into a local
+    // vector, which is overwritten in full before it is returned.
+    let mut dst = _mm_undefined_ps();
+    ptr::copy_nonoverlapping(
+        p as *const u8,
+        &mut dst as *mut __m128 as *mut u8,
+        mem::size_of::<__m128>(),
+    );
+    dst
+}
+
+/// Load four `f32` values from aligned memory into a `__m128` in reverse
+/// order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let a0 = *p;
+/// let a1 = *p.offset(1);
+/// let a2 = *p.offset(2);
+/// let a3 = *p.offset(3);
+/// __m128::new(a3, a2, a1, a0)
+/// ```
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
+/// shuffling.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
+    // Aligned load first, then reverse the lane order with a shuffle.
+    let a = _mm_load_ps(p);
+    simd_shuffle4(a, a, [3, 2, 1, 0])
+}
+
+/// Store the upper half of `a` (64 bits) into memory.
+///
+/// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may
+/// choose to generate an equivalent sequence of other instructions.
+#[inline]
+#[target_feature(enable = "sse")]
+// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's
+// fine.
+// On i586 (no SSE2) it just generates plain MOV instructions.
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2"),
+        not(target_os = "windows")),
+    // assert_instr(movhpd)
+    assert_instr(movhps) // LLVM7 prefers single-precision instructions
+)]
+pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
+    // The two architectures view `a` through different 64-bit lane types
+    // purely to steer code generation; both store lane 1 (the upper 64 bits).
+    #[cfg(target_arch = "x86")]
+    {
+        // If this is a `f64x2` then on i586, LLVM generates fldl & fstpl which
+        // is just silly
+        let a64: u64x2 = mem::transmute(a);
+        let a_hi = a64.extract(1);
+        *(p as *mut u64) = a_hi;
+    }
+    #[cfg(target_arch = "x86_64")]
+    {
+        // If this is a `u64x2` LLVM generates a pshufd + movq, but we really
+        // want a MOVHPD or MOVHPS here.
+        let a64: f64x2 = mem::transmute(a);
+        let a_hi = a64.extract(1);
+        *p = mem::transmute(a_hi);
+    }
+}
+
+/// Store the lower half of `a` (64 bits) into memory.
+///
+/// This intrinsic corresponds to the `MOVQ` instruction. The compiler may
+/// choose to generate an equivalent sequence of other instructions.
+// NOTE(review): the test below asserts `movlps`, not `MOVQ` - confirm which
+// instruction the documentation above should name.
+#[inline]
+#[target_feature(enable = "sse")]
+// On i586 the codegen just generates plain MOVs. No need to test for that.
+#[cfg_attr(
+    all(
+        test,
+        any(target_arch = "x86_64", target_feature = "sse2"),
+        not(target_os = "windows")
+    ),
+    assert_instr(movlps)
+)]
+pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) {
+    // NOTE: despite its name, `a_hi` holds lane 0, i.e. the *lower* 64 bits
+    // of `a`; the name mirrors the code in `_mm_storeh_pi`.
+    #[cfg(target_arch = "x86")]
+    {
+        // Same as for _mm_storeh_pi: i586 code gen would use floating point
+        // stack.
+        let a64: u64x2 = mem::transmute(a);
+        let a_hi = a64.extract(0);
+        *(p as *mut u64) = a_hi;
+    }
+    #[cfg(target_arch = "x86_64")]
+    {
+        let a64: f64x2 = mem::transmute(a);
+        let a_hi = a64.extract(0);
+        *p = mem::transmute(a_hi);
+    }
+}
+
+/// Store the lowest 32-bit float of `a` into memory.
+///
+/// This intrinsic corresponds to the `MOVSS` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
+    // Only lane 0 of `a` is written; a single `f32` is stored at `p`.
+    *p = simd_extract(a, 0);
+}
+
+/// Store the lowest 32-bit float of `a` repeated four times into *aligned*
+/// memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let x = a.extract(0);
+/// *p = x;
+/// *p.offset(1) = x;
+/// *p.offset(2) = x;
+/// *p.offset(3) = x;
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
+    // Broadcast lane 0 to all four lanes, then store the whole vector.
+    let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
+    *(p as *mut __m128) = b;
+}
+
+/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html): stores the lowest
+/// element of `a` repeated four times into aligned memory at `p`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
+    _mm_store1_ps(p, a);
+}
+
+/// Store four 32-bit floats into *aligned* memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
+    // The cast raises the pointee type from `f32` to `__m128`; the caller
+    // must guarantee the 16-byte alignment described above.
+    *(p as *mut __m128) = a;
+}
+
+/// Store four 32-bit floats into memory. There are no restrictions on memory
+/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
+/// faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
+    // Byte-wise copy of the whole vector, so that no alignment beyond that of
+    // `u8` is required of `p`.
+    ptr::copy_nonoverlapping(
+        &a as *const __m128 as *const u8,
+        p as *mut u8,
+        mem::size_of::<__m128>(),
+    );
+}
+
+/// Store four 32-bit floats into *aligned* memory in reverse order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// *p = a.extract(3);
+/// *p.offset(1) = a.extract(2);
+/// *p.offset(2) = a.extract(1);
+/// *p.offset(3) = a.extract(0);
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
+    // Reverse the lane order, then store the whole vector.
+    let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
+    *(p as *mut __m128) = b;
+}
+
+/// Return a `__m128` with the first component from `b` and the remaining
+/// components from `a`.
+///
+/// In other words for any `a` and `b`:
+/// ```text
+/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
+    // Index 4 is lane 0 of `b`; indices 1..=3 are the matching lanes of `a`.
+    simd_shuffle4(a, b, [4, 1, 2, 3])
+}
+
+/// Perform a serializing operation on all store-to-memory instructions that
+/// were issued prior to this instruction.
+///
+/// Guarantees that every store instruction that precedes, in program order, is
+/// globally visible before any store instruction which follows the fence in
+/// program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sfence() {
+    // Thin wrapper around the `llvm.x86.sse.sfence` intrinsic.
+    sfence()
+}
+
+/// Get the unsigned 32-bit value of the MXCSR control and status register.
+///
+/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(stmxcsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_getcsr() -> u32 {
+    // `stmxcsr` receives a pointer to the local `result` and fills it in;
+    // the pointer is cast to `*mut i8` to match the intrinsic's declaration.
+    let mut result = 0_i32;
+    stmxcsr((&mut result) as *mut _ as *mut i8);
+    result as u32
+}
+
+/// Set the MXCSR register with the 32-bit unsigned integer value.
+///
+/// This register controls how SIMD instructions handle floating point
+/// operations. Modifying this register only affects the current thread.
+///
+/// It contains several groups of flags:
+///
+/// * *Exception flags* report which exceptions occurred since last they were
+/// reset.
+///
+/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
+/// default these flags are all set to 1, so all exceptions are masked. When
+/// an exception is masked, the processor simply sets the exception flag and
+/// continues the operation. If the exception is unmasked, the flag is also set
+/// but additionally an exception handler is invoked.
+///
+/// * *Rounding mode flags* control the rounding mode of floating point
+/// instructions.
+///
+/// * The *denormals-are-zero mode flag* turns all numbers which would be
+/// denormalized (exponent bits are all zeros) into zeros.
+///
+/// ## Exception Flags
+///
+/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
+///   Infinity by Infinity).
+///
+/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
+///   number. Mainly this can cause loss of precision.
+///
+/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
+///
+/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
+///   result was too large to be represented (e.g., an `f32` with absolute
+///   value greater than `2^128`).
+///
+/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
+///   result was too small to be represented in a normalized way (e.g., an
+///   `f32` with absolute value smaller than `2^-126`).
+///
+/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
+///   precision exception). This means some precision was lost due to rounding.
+///   For example, the fraction `1/3` cannot be represented accurately in a
+///   32 or 64 bit float and computing it would cause this exception to be
+///   raised. Precision exceptions are very common, so they are usually masked.
+///
+/// Exception flags can be read and set using the convenience functions
+/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
+/// check if an operation caused some overflow:
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
+///                             // perform calculations
+/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
+///     // handle overflow
+/// }
+/// ```
+///
+/// ## Masking Flags
+///
+/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
+/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+///
+/// A single masking bit can be set via
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// However, since mask bits are by default all set to 1, it is more common to
+/// want to *disable* certain bits. For example, to unmask the underflow
+/// exception, use:
+///
+/// ```rust,ignore
+/// // unmask underflow exception
+/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// Warning: an unmasked exception will cause an exception handler to be
+/// called. The standard handler will simply terminate the process. So, in
+/// this case any underflow exception would terminate the current process with
+/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
+///
+/// ## Rounding Mode
+///
+/// The rounding mode is described using two bits. It can be read and set using
+/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
+/// `_MM_SET_ROUNDING_MODE(mode)`.
+///
+/// The rounding modes are:
+///
+/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
+///   value. If two values are equally close, round to even (i.e., least
+///   significant bit will be zero).
+///
+/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
+///
+/// * `_MM_ROUND_UP`: Round toward positive Infinity.
+///
+/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
+///
+/// Example:
+///
+/// ```rust,ignore
+/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
+/// ```
+///
+/// ## Denormals-are-zero/Flush-to-zero Mode
+///
+/// If this bit is set, values that would be denormalized will be set to zero
+/// instead. This is turned off by default.
+///
+/// You can read and enable/disable this mode via the helper functions
+/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
+///
+/// ```rust,ignore
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
+/// ```
+///
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ldmxcsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setcsr(val: u32) {
+    // `ldmxcsr` takes the new value by pointer; the pointer to `val` is cast
+    // to `*const i8` to match the intrinsic's declaration.
+    ldmxcsr(&val as *const _ as *const i8);
+}
+
+// MXCSR exception flags (bits 0..=5).
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
+/// Bitwise OR of all six `_MM_EXCEPT_*` flags.
+///
+/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_MASK: u32 = 0x003f;
+
+// MXCSR exception masking flags (bits 7..=12), one per exception flag.
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_INVALID: u32 = 0x0080;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_DENORM: u32 = 0x0100;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_INEXACT: u32 = 0x1000;
+/// Bitwise OR of all six `_MM_MASK_*` flags.
+///
+/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_MASK: u32 = 0x1f80;
+
+// MXCSR rounding mode (bits 13..=14).
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_NEAREST: u32 = 0x0000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_DOWN: u32 = 0x2000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_UP: u32 = 0x4000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
+
+/// Bitmask selecting the two rounding-mode bits.
+///
+/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_MASK: u32 = 0x6000;
+
+// MXCSR flush-to-zero mode (bit 15).
+
+/// Bitmask selecting the flush-to-zero bit.
+///
+/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
+
+/// Return the exception-masking bits of the MXCSR register, i.e. the current
+/// register value with every bit outside `_MM_MASK_MASK` cleared.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
+    _mm_getcsr() & _MM_MASK_MASK
+}
+
+/// Return the exception flags of the MXCSR register, i.e. the current
+/// register value with every bit outside `_MM_EXCEPT_MASK` cleared.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
+    _mm_getcsr() & _MM_EXCEPT_MASK
+}
+
+/// Return the flush-to-zero bit of the MXCSR register, i.e. the current
+/// register value with every bit outside `_MM_FLUSH_ZERO_MASK` cleared. The
+/// result equals `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
+    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
+}
+
+/// Return the rounding-mode bits of the MXCSR register, i.e. the current
+/// register value with every bit outside `_MM_ROUND_MASK` cleared. The result
+/// equals one of the `_MM_ROUND_*` mode constants.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
+    _mm_getcsr() & _MM_ROUND_MASK
+}
+
+/// Replace the exception-masking bits of the MXCSR register with `x`.
+///
+/// All bits covered by `_MM_MASK_MASK` are cleared first and `x` is then
+/// OR-ed in, so `x` gives the complete new set of masking bits. `x` is not
+/// filtered: any bits of `x` outside `_MM_MASK_MASK` are written as well.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
+}
+
+/// Replace the exception flags of the MXCSR register with `x`.
+///
+/// All bits covered by `_MM_EXCEPT_MASK` are cleared first and `x` is then
+/// OR-ed in, so passing `0` clears every exception flag. `x` is not filtered:
+/// any bits of `x` outside `_MM_EXCEPT_MASK` are written as well.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
+}
+
+/// Replace the flush-to-zero bit of the MXCSR register with `x`, which is
+/// expected to be `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
+///
+/// The bit covered by `_MM_FLUSH_ZERO_MASK` is cleared first and `x` is then
+/// OR-ed in. `x` is not filtered: any bits of `x` outside
+/// `_MM_FLUSH_ZERO_MASK` are written as well.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
+    let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
+    _mm_setcsr(val)
+}
+
+/// Replace the rounding-mode bits of the MXCSR register with `x`, which is
+/// expected to be one of the `_MM_ROUND_*` mode constants.
+///
+/// The bits covered by `_MM_ROUND_MASK` are cleared first and `x` is then
+/// OR-ed in. `x` is not filtered: any bits of `x` outside `_MM_ROUND_MASK`
+/// are written as well.
+///
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
+}
+
+/// Prefetch strategy: fetch into all levels of the cache hierarchy.
+///
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T0: i32 = 3;
+
+/// Prefetch strategy: fetch into L2 and higher.
+///
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T1: i32 = 2;
+
+/// Prefetch strategy: fetch into L3 and higher, or an
+/// implementation-specific choice.
+///
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T2: i32 = 1;
+
+/// Prefetch strategy: fetch using the non-temporal access (NTA) hint.
+///
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_NTA: i32 = 0;
+
+/// Fetch the cache line that contains address `p` using the given `strategy`.
+///
+/// The `strategy` must be one of:
+///
+/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
+///   cache hierarchy.
+///
+/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
+///
+/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
+///   an implementation-specific choice (e.g., L2 if there is no L3).
+///
+/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
+///   non-temporal access (NTA) hint. It may be a place closer than main memory
+///   but outside of the cache hierarchy. This is used to reduce access latency
+///   without polluting the cache.
+///
+/// The actual implementation depends on the particular CPU. This instruction
+/// is considered a hint, so the CPU is also free to simply ignore the request.
+///
+/// The amount of prefetched data depends on the cache line size of the
+/// specific CPU, but it will be at least 32 bytes.
+///
+/// Common caveats:
+///
+/// * Most modern CPUs already automatically prefetch data based on predicted
+///   access patterns.
+///
+/// * Data is usually not fetched if this would cause a TLB miss or a page
+///   fault.
+///
+/// * Too much prefetching can cause unnecessary cache evictions.
+///
+/// * Prefetching may also fail if there are not enough memory-subsystem
+///   resources (e.g., request buffers).
+///
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))]
+#[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))]
+#[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))]
+#[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_prefetch(p: *const i8, strategy: i32) {
+    // The `strategy` must be a compile-time constant, so we use a short form
+    // of `constify_imm8!` for now.
+    // We use the `llvm.prefetch` intrinsic with `rw` = 0 (read), and
+    // `cache type` = 1 (data cache). `locality` is based on our `strategy`.
+    //
+    // The hint constants are used directly as the locality argument
+    // (`_MM_HINT_NTA` = 0, `_MM_HINT_T2` = 1, `_MM_HINT_T1` = 2,
+    // `_MM_HINT_T0` = 3). Any value other than 0, 1 or 2 takes the `_` arm
+    // and is treated like `_MM_HINT_T0`.
+    macro_rules! pref {
+        ($imm8:expr) => {
+            match $imm8 {
+                0 => prefetch(p, 0, 0, 1),
+                1 => prefetch(p, 0, 1, 1),
+                2 => prefetch(p, 0, 2, 1),
+                _ => prefetch(p, 0, 3, 1),
+            }
+        };
+    }
+    pref!(strategy)
+}
+
+/// Return a vector of type `__m128` with undefined elements.
+///
+/// The contents of the returned value are uninitialized; callers are expected
+/// to overwrite every element before reading it.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_ps() -> __m128 {
+    // FIXME: this function should return MaybeUninit<__m128>
+    mem::MaybeUninit::<__m128>::uninitialized().into_inner()
+}
+
+/// Transpose the 4x4 matrix formed by 4 rows of `__m128` in place.
+///
+/// On return, `row0`..`row3` have been overwritten with the rows of the
+/// transposed matrix.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_TRANSPOSE4_PS(
+    row0: &mut __m128,
+    row1: &mut __m128,
+    row2: &mut __m128,
+    row3: &mut __m128,
+) {
+    // Step 1: pair up rows (0, 1) and (2, 3) with the unpack-low and
+    // unpack-high intrinsics. All four temporaries are computed before any
+    // row is overwritten.
+    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
+    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
+    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
+    let tmp3 = _mm_unpackhi_ps(*row2, *row3);
+
+    // Step 2: recombine 64-bit halves of the temporaries into the output rows.
+    *row0 = _mm_movelh_ps(tmp0, tmp2);
+    *row1 = _mm_movehl_ps(tmp2, tmp0);
+    *row2 = _mm_movelh_ps(tmp1, tmp3);
+    *row3 = _mm_movehl_ps(tmp3, tmp1);
+}
+
+// Declarations of the LLVM intrinsics used by this module. Each `link_name`
+// names the LLVM intrinsic that the Rust-side function is bound to.
+// NOTE(review): `improper_ctypes` is presumably allowed because the vector
+// types `__m128` / `__m64` appear in these signatures - confirm.
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse.add.ss"]
+    fn addss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.sub.ss"]
+    fn subss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.mul.ss"]
+    fn mulss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.div.ss"]
+    fn divss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.sqrt.ss"]
+    fn sqrtss(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.sqrt.ps"]
+    fn sqrtps(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.rcp.ss"]
+    fn rcpss(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.rcp.ps"]
+    fn rcpps(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.rsqrt.ss"]
+    fn rsqrtss(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.rsqrt.ps"]
+    fn rsqrtps(a: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.min.ss"]
+    fn minss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.min.ps"]
+    fn minps(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.max.ss"]
+    fn maxss(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.max.ps"]
+    fn maxps(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse.movmsk.ps"]
+    fn movmskps(a: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.cmp.ps"]
+    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
+    #[link_name = "llvm.x86.sse.comieq.ss"]
+    fn comieq_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.comilt.ss"]
+    fn comilt_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.comile.ss"]
+    fn comile_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.comigt.ss"]
+    fn comigt_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.comige.ss"]
+    fn comige_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.comineq.ss"]
+    fn comineq_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomieq.ss"]
+    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomilt.ss"]
+    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomile.ss"]
+    fn ucomile_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomigt.ss"]
+    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomige.ss"]
+    fn ucomige_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.ucomineq.ss"]
+    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.cvtss2si"]
+    fn cvtss2si(a: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.cvttss2si"]
+    fn cvttss2si(a: __m128) -> i32;
+    #[link_name = "llvm.x86.sse.cvtsi2ss"]
+    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
+    #[link_name = "llvm.x86.sse.sfence"]
+    fn sfence();
+    #[link_name = "llvm.x86.sse.stmxcsr"]
+    fn stmxcsr(p: *mut i8);
+    #[link_name = "llvm.x86.sse.ldmxcsr"]
+    fn ldmxcsr(p: *const i8);
+    #[link_name = "llvm.prefetch"]
+    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
+    #[link_name = "llvm.x86.sse.cmp.ss"]
+    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
+    #[link_name = "llvm.x86.mmx.movnt.dq"]
+    fn movntdq(a: *mut __m64, b: __m64);
+    #[link_name = "llvm.x86.sse.cvtpi2ps"]
+    fn cvtpi2ps(a: __m128, b: __m64) -> __m128;
+    #[link_name = "llvm.x86.mmx.maskmovq"]
+    fn maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8);
+    #[link_name = "llvm.x86.mmx.pextr.w"]
+    fn pextrw(a: __m64, imm8: i32) -> i32;
+    #[link_name = "llvm.x86.mmx.pinsr.w"]
+    fn pinsrw(a: __m64, d: i32, imm8: i32) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmovmskb"]
+    fn pmovmskb(a: __m64) -> i32;
+    #[link_name = "llvm.x86.sse.pshuf.w"]
+    fn pshufw(a: __m64, imm8: i8) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmaxs.w"]
+    fn pmaxsw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmaxu.b"]
+    fn pmaxub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmins.w"]
+    fn pminsw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pminu.b"]
+    fn pminub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmulhu.w"]
+    fn pmulhuw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmull.w"]
+    fn pmullw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.b"]
+    fn pavgb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.w"]
+    fn pavgw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psad.bw"]
+    fn psadbw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.sse.cvtps2pi"]
+    fn cvtps2pi(a: __m128) -> __m64;
+    #[link_name = "llvm.x86.sse.cvttps2pi"]
+    fn cvttps2pi(a: __m128) -> __m64;
+}
+
+/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception _may_ be generated.
+///
+/// The pointer is reinterpreted as `*mut __m128` and the whole vector is
+/// written through `intrinsics::nontemporal_store`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movntps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+// The `*mut f32` -> `*mut __m128` cast below is what `cast_ptr_alignment`
+// warns about; the alignment requirement is documented as the caller's
+// obligation above.
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
+    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
+}
+
+/// Stores the 64 bits of integer data in `a` to the memory at `mem_addr`
+/// using a non-temporal memory hint.
+///
+/// Forwards to the `movntdq` binding of the `llvm.x86.mmx.movnt.dq` LLVM
+/// intrinsic.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(movntq))]
+pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) {
+    movntdq(mem_addr, a)
+}
+
+/// Compares the packed 16-bit signed integers of `a` and `b`, writing the
+/// greater value of each pair into the result.
+///
+/// Forwards to the `pmaxsw` binding (`llvm.x86.mmx.pmaxs.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmaxsw))]
+pub unsafe fn _mm_max_pi16(a: __m64, b: __m64) -> __m64 {
+    pmaxsw(a, b)
+}
+
+/// Compares the packed 16-bit signed integers of `a` and `b`, writing the
+/// greater value of each pair into the result.
+///
+/// This is an alias of `_mm_max_pi16` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmaxsw))]
+pub unsafe fn _m_pmaxsw(a: __m64, b: __m64) -> __m64 {
+    _mm_max_pi16(a, b)
+}
+
+/// Compares the packed 8-bit unsigned integers of `a` and `b`, writing the
+/// greater value of each pair into the result.
+///
+/// Forwards to the `pmaxub` binding (`llvm.x86.mmx.pmaxu.b`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmaxub))]
+pub unsafe fn _mm_max_pu8(a: __m64, b: __m64) -> __m64 {
+    pmaxub(a, b)
+}
+
+/// Compares the packed 8-bit unsigned integers of `a` and `b`, writing the
+/// greater value of each pair into the result.
+///
+/// This is an alias of `_mm_max_pu8` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmaxub))]
+pub unsafe fn _m_pmaxub(a: __m64, b: __m64) -> __m64 {
+    _mm_max_pu8(a, b)
+}
+
+/// Compares the packed 16-bit signed integers of `a` and `b`, writing the
+/// smaller value of each pair into the result.
+///
+/// Forwards to the `pminsw` binding (`llvm.x86.mmx.pmins.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pminsw))]
+pub unsafe fn _mm_min_pi16(a: __m64, b: __m64) -> __m64 {
+    pminsw(a, b)
+}
+
+/// Compares the packed 16-bit signed integers of `a` and `b`, writing the
+/// smaller value of each pair into the result.
+///
+/// This is an alias of `_mm_min_pi16` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pminsw))]
+pub unsafe fn _m_pminsw(a: __m64, b: __m64) -> __m64 {
+    _mm_min_pi16(a, b)
+}
+
+/// Compares the packed 8-bit unsigned integers of `a` and `b`, writing the
+/// smaller value of each pair into the result.
+///
+/// Forwards to the `pminub` binding (`llvm.x86.mmx.pminu.b`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pminub))]
+pub unsafe fn _mm_min_pu8(a: __m64, b: __m64) -> __m64 {
+    pminub(a, b)
+}
+
+/// Compares the packed 8-bit unsigned integers of `a` and `b`, writing the
+/// smaller value of each pair into the result.
+///
+/// This is an alias of `_mm_min_pu8` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pminub))]
+pub unsafe fn _m_pminub(a: __m64, b: __m64) -> __m64 {
+    _mm_min_pu8(a, b)
+}
+
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+///
+/// Forwards to the `pmulhuw` binding (`llvm.x86.mmx.pmulhu.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 {
+    pmulhuw(a, b)
+}
+
+/// Multiplies packed 16-bit integer values and writes the
+/// low-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+///
+/// Forwards to the `pmullw` binding (`llvm.x86.mmx.pmull.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmullw))]
+pub unsafe fn _mm_mullo_pi16(a: __m64, b: __m64) -> __m64 {
+    pmullw(a, b)
+}
+
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+///
+/// This is an alias of `_mm_mulhi_pu16` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _m_pmulhuw(a: __m64, b: __m64) -> __m64 {
+    _mm_mulhi_pu16(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// Forwards to the `pavgb` binding (`llvm.x86.mmx.pavg.b`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _mm_avg_pu8(a: __m64, b: __m64) -> __m64 {
+    pavgb(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// This is an alias of `_mm_avg_pu8` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _m_pavgb(a: __m64, b: __m64) -> __m64 {
+    _mm_avg_pu8(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// Forwards to the `pavgw` binding (`llvm.x86.mmx.pavg.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _mm_avg_pu16(a: __m64, b: __m64) -> __m64 {
+    pavgw(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// This is an alias of `_mm_avg_pu16` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _m_pavgw(a: __m64, b: __m64) -> __m64 {
+    _mm_avg_pu16(a, b)
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value of each
+/// difference. The sum of the 8 absolute differences is then written to
+/// bits `[15:0]` of the destination; the remaining bits `[63:16]` are cleared.
+///
+/// Forwards to the `psadbw` binding (`llvm.x86.mmx.psad.bw`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _mm_sad_pu8(a: __m64, b: __m64) -> __m64 {
+    psadbw(a, b)
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value of each
+/// difference. The sum of the 8 absolute differences is then written to
+/// bits `[15:0]` of the destination; the remaining bits `[63:16]` are cleared.
+///
+/// This is an alias of `_mm_sad_pu8` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _m_psadbw(a: __m64, b: __m64) -> __m64 {
+    _mm_sad_pu8(a, b)
+}
+
+/// Converts the two elements of the 64-bit vector of `[2 x i32]` in `b` into
+/// two floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand, `a`.
+///
+/// Forwards to the `cvtpi2ps` binding (`llvm.x86.sse.cvtpi2ps`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi32_ps(a: __m128, b: __m64) -> __m128 {
+    cvtpi2ps(a, b)
+}
+
+/// Converts the two elements of the 64-bit vector of `[2 x i32]` in `b` into
+/// two floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand, `a`.
+///
+/// This is an alias of `_mm_cvtpi32_ps` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvt_pi2ps(a: __m128, b: __m64) -> __m128 {
+    _mm_cvtpi32_ps(a, b)
+}
+
+/// Converts the lower 4 signed 8-bit values of `a` into a 128-bit vector of
+/// 4 `f32`s.
+///
+/// The bytes are first widened to 16 bits by interleaving them with a
+/// per-byte `0 > a` comparison mask, then handed to `_mm_cvtpi16_ps`.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> __m128 {
+    let negative_mask = _mm_cmpgt_pi8(_mm_setzero_si64(), a);
+    let widened = _mm_unpacklo_pi8(a, negative_mask);
+    _mm_cvtpi16_ps(widened)
+}
+
+/// Converts the lower 4 unsigned 8-bit values of `a` into a 128-bit vector
+/// of 4 `f32`s.
+///
+/// The bytes are widened to 16 bits by interleaving them with zero bytes,
+/// then handed to `_mm_cvtpi16_ps`.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> __m128 {
+    let zero = _mm_setzero_si64();
+    _mm_cvtpi16_ps(_mm_unpacklo_pi8(a, zero))
+}
+
+/// Converts a 64-bit vector of signed `i16`s into a 128-bit vector of 4
+/// `f32`s.
+///
+/// Each half of `a` is widened to `[2 x i32]` by interleaving it with a
+/// per-lane `0 > a` comparison mask and converted with `cvtpi2ps`; the
+/// upper half is converted first and moved into the high lanes with
+/// `_mm_movelh_ps` before the lower half is converted into the low lanes.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi16_ps(a: __m64) -> __m128 {
+    let negative_mask = _mm_cmpgt_pi16(_mm_setzero_si64(), a);
+
+    // Upper two lanes: convert, then duplicate into the high half.
+    let upper_i32 = _mm_unpackhi_pi16(a, negative_mask);
+    let upper_f32 = cvtpi2ps(_mm_setzero_ps(), upper_i32);
+    let acc = _mm_movelh_ps(upper_f32, upper_f32);
+
+    // Lower two lanes overwrite the low half of the accumulator.
+    let lower_i32 = _mm_unpacklo_pi16(a, negative_mask);
+    cvtpi2ps(acc, lower_i32)
+}
+
+/// Converts a 64-bit vector of unsigned 16-bit integers into a 128-bit
+/// vector of 4 `f32`s.
+///
+/// Each half of `a` is widened to `[2 x i32]` by interleaving it with zeros
+/// and converted with `cvtpi2ps`; the upper half is converted first and
+/// moved into the high lanes with `_mm_movelh_ps` before the lower half is
+/// converted into the low lanes.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> __m128 {
+    let zero = _mm_setzero_si64();
+
+    // Upper two lanes: convert, then duplicate into the high half.
+    let upper_i32 = _mm_unpackhi_pi16(a, zero);
+    let upper_f32 = cvtpi2ps(_mm_setzero_ps(), upper_i32);
+    let acc = _mm_movelh_ps(upper_f32, upper_f32);
+
+    // Lower two lanes overwrite the low half of the accumulator.
+    let lower_i32 = _mm_unpacklo_pi16(a, zero);
+    cvtpi2ps(acc, lower_i32)
+}
+
+/// Converts the two 32-bit signed integer values from each 64-bit vector
+/// operand of `[2 x i32]` into a 128-bit vector of `[4 x float]`.
+///
+/// `b` is converted first and moved to the high lanes with `_mm_movelh_ps`;
+/// `a` is then converted into the low lanes.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi32x2_ps(a: __m64, b: __m64) -> __m128 {
+    let from_b = _mm_cvtpi32_ps(_mm_setzero_ps(), b);
+    let high_filled = _mm_movelh_ps(from_b, from_b);
+    _mm_cvtpi32_ps(high_filled, a)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand (`a`) to the specified memory location
+/// (`mem_addr`), as specified by the most significant bit in the
+/// corresponding element in the second 64-bit integer vector operand
+/// (`mask`).
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+///
+/// Forwards to the `maskmovq` binding (`llvm.x86.mmx.maskmovq`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _mm_maskmove_si64(a: __m64, mask: __m64, mem_addr: *mut i8) {
+    maskmovq(a, mask, mem_addr)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand (`a`) to the specified memory location
+/// (`mem_addr`), as specified by the most significant bit in the
+/// corresponding element in the second 64-bit integer vector operand
+/// (`mask`).
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+///
+/// This is an alias of `_mm_maskmove_si64` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _m_maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8) {
+    _mm_maskmove_si64(a, mask, mem_addr)
+}
+
+/// Extracts 16-bit element from a 64-bit vector of `[4 x i16]` and
+/// returns it, as specified by the immediate integer operand `imm2`.
+///
+/// `imm2` must be a compile-time constant (`rustc_args_required_const`); it
+/// is dispatched through `constify_imm2!` to the `pextrw` binding
+/// (`llvm.x86.mmx.pextr.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_extract_pi16(a: __m64, imm2: i32) -> i32 {
+    macro_rules! call {
+        ($imm2:expr) => {
+            // `pextrw` is declared as returning `i32`, so no cast is needed.
+            pextrw(a, $imm2)
+        };
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Extracts 16-bit element from a 64-bit vector of `[4 x i16]` and
+/// returns it, as specified by the immediate integer operand `imm2`.
+///
+/// Same operation as `_mm_extract_pi16`. The body is repeated rather than
+/// forwarded: `imm2` must be a compile-time constant
+/// (`rustc_args_required_const`) and is dispatched through `constify_imm2!`
+/// to the `pextrw` binding (`llvm.x86.mmx.pextr.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _m_pextrw(a: __m64, imm2: i32) -> i32 {
+    macro_rules! call {
+        ($imm2:expr) => {
+            // `pextrw` is declared as returning `i32`, so no cast is needed.
+            pextrw(a, $imm2)
+        };
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Copies data from the 64-bit vector of `[4 x i16]` in `a` to the
+/// destination, and inserts the lower 16-bits of the integer operand `d` at
+/// the 16-bit offset specified by the immediate operand `imm2`.
+///
+/// `imm2` must be a compile-time constant (`rustc_args_required_const`); it
+/// is dispatched through `constify_imm2!` to the `pinsrw` binding
+/// (`llvm.x86.mmx.pinsr.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_insert_pi16(a: __m64, d: i32, imm2: i32) -> __m64 {
+    macro_rules! call {
+        ($imm2:expr) => {
+            pinsrw(a, d, $imm2)
+        };
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Copies data from the 64-bit vector of `[4 x i16]` in `a` to the
+/// destination, and inserts the lower 16-bits of the integer operand `d` at
+/// the 16-bit offset specified by the immediate operand `imm2`.
+///
+/// Same operation as `_mm_insert_pi16`; the body is repeated rather than
+/// forwarded. `imm2` must be a compile-time constant
+/// (`rustc_args_required_const`) and is dispatched through `constify_imm2!`
+/// to the `pinsrw` binding (`llvm.x86.mmx.pinsr.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _m_pinsrw(a: __m64, d: i32, imm2: i32) -> __m64 {
+    macro_rules! call {
+        ($imm2:expr) => {
+            pinsrw(a, d, $imm2)
+        };
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create an 8-bit mask value (one bit per element).
+/// Zero-extends the value to 32-bit integer and writes it to the
+/// destination.
+///
+/// Forwards to the `pmovmskb` binding (`llvm.x86.mmx.pmovmskb`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _mm_movemask_pi8(a: __m64) -> i32 {
+    pmovmskb(a)
+}
+
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create an 8-bit mask value (one bit per element).
+/// Zero-extends the value to 32-bit integer and writes it to the
+/// destination.
+///
+/// This is an alias of `_mm_movemask_pi8` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _m_pmovmskb(a: __m64) -> i32 {
+    _mm_movemask_pi8(a)
+}
+
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand `imm8`.
+///
+/// `imm8` must be a compile-time constant (`rustc_args_required_const`); it
+/// is dispatched through `constify_imm8!` to the `pshufw` binding
+/// (`llvm.x86.sse.pshuf.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_shuffle_pi16(a: __m64, imm8: i32) -> __m64 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            pshufw(a, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand `imm8`.
+///
+/// Same operation as `_mm_shuffle_pi16`; the body is repeated rather than
+/// forwarded. `imm8` must be a compile-time constant
+/// (`rustc_args_required_const`) and is dispatched through `constify_imm8!`
+/// to the `pshufw` binding (`llvm.x86.sse.pshuf.w`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _m_pshufw(a: __m64, imm8: i32) -> __m64 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            pshufw(a, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Converts the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers with truncation.
+///
+/// Forwards to the `cvttps2pi` binding (`llvm.x86.sse.cvttps2pi`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvttps_pi32(a: __m128) -> __m64 {
+    cvttps2pi(a)
+}
+
+/// Converts the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers with truncation.
+///
+/// This is an alias of `_mm_cvttps_pi32` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvtt_ps2pi(a: __m128) -> __m64 {
+    _mm_cvttps_pi32(a)
+}
+
+/// Converts the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers.
+///
+/// Forwards to the `cvtps2pi` binding (`llvm.x86.sse.cvtps2pi`).
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi32(a: __m128) -> __m64 {
+    cvtps2pi(a)
+}
+
+/// Converts the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers.
+///
+/// This is an alias of `_mm_cvtps_pi32` and simply forwards to it.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvt_ps2pi(a: __m128) -> __m64 {
+    _mm_cvtps_pi32(a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 16-bit integers.
+///
+/// The low and high pairs of `a` are each converted with `_mm_cvtps_pi32`
+/// (the high pair is first moved down with `_mm_movehl_ps`) and the two
+/// results are combined with `_mm_packs_pi32`.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi16(a: __m128) -> __m64 {
+    let low_pair = _mm_cvtps_pi32(a);
+    let high_pair = _mm_cvtps_pi32(_mm_movehl_ps(a, a));
+    _mm_packs_pi32(low_pair, high_pair)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 8-bit integers, and returns them in the lower 4 elements of the
+/// result.
+///
+/// The values are first converted with `_mm_cvtps_pi16` and then combined
+/// with a zero vector through `_mm_packs_pi16`.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi8(a: __m128) -> __m64 {
+    let as_i16 = _mm_cvtps_pi16(a);
+    _mm_packs_pi16(as_i16, _mm_setzero_si64())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::f32::NAN;
+    use std::mem::transmute;
+    use stdsimd_test::simd_test;
+    use test::black_box; // Used to inhibit constant-folding.
+
+    use core_arch::simd::*;
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_add_ps() {
+        // All four lanes are summed independently.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(-101.0, 25.0, 0.0, -15.0);
+        assert_eq_m128(_mm_add_ps(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_add_ss() {
+        // Built with `_mm_set_ps` (not `_mm_setr_ps`): relative to `lhs`,
+        // only the final argument changes (-10.0 + -5.0 = -15.0).
+        let lhs = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_set_ps(-1.0, 5.0, 0.0, -15.0);
+        assert_eq_m128(_mm_add_ss(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_sub_ps() {
+        // All four lanes are subtracted independently.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(99.0, -15.0, 0.0, -5.0);
+        assert_eq_m128(_mm_sub_ps(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_sub_ss() {
+        // Only the first lane is subtracted; the rest are copied from `lhs`.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(99.0, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_sub_ss(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_mul_ps() {
+        // All four lanes are multiplied independently.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(100.0, 100.0, 0.0, 50.0);
+        assert_eq_m128(_mm_mul_ps(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_mul_ss() {
+        // Only the first lane is multiplied; the rest are copied from `lhs`.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(100.0, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_mul_ss(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_div_ps() {
+        // All four lanes are divided independently.
+        let numer = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
+        let denom = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
+        let expected = _mm_setr_ps(0.01, 0.25, 10.0, 2.0);
+        assert_eq_m128(_mm_div_ps(numer, denom), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_div_ss() {
+        // Only the first lane is divided; the rest are copied from `numer`.
+        let numer = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let denom = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(0.01, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_div_ss(numer, denom), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_sqrt_ss() {
+        // Only the first lane (4.0 -> 2.0) is transformed.
+        let input = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let expected = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
+        assert_eq_m128(_mm_sqrt_ss(input), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_sqrt_ps() {
+        // Every lane is replaced by its square root.
+        let input = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let expected = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
+        assert_eq_m128(_mm_sqrt_ps(input), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_rcp_ss() {
+        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let r = _mm_rcp_ss(a);
+        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
+        // The reciprocal is only an approximation, so lane 0 is compared
+        // with the same tolerance the `_mm_rcp_ps` / `_mm_rsqrt_ss` tests use
+        // instead of requiring one exact bit pattern.
+        let rel_err = 0.00048828125;
+        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
+        // The remaining lanes are copied from `a` and must match exactly.
+        for i in 1..4 {
+            assert_eq!(get_m128(r, i), get_m128(e, i));
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_rcp_ps() {
+        let input = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let actual = _mm_rcp_ps(input);
+        let expected = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
+        // Approximate instruction: compare every lane within a tolerance.
+        let rel_err = 0.00048828125;
+        let tolerance = 2. * rel_err;
+        for lane in 0..4 {
+            assert_approx_eq!(get_m128(actual, lane), get_m128(expected, lane), tolerance);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_rsqrt_ss() {
+        let input = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let actual = _mm_rsqrt_ss(input);
+        let expected = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
+        // Approximate instruction: compare every lane within a tolerance.
+        let rel_err = 0.00048828125;
+        let tolerance = 2. * rel_err;
+        for lane in 0..4 {
+            assert_approx_eq!(get_m128(actual, lane), get_m128(expected, lane), tolerance);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_rsqrt_ps() {
+        let input = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+        let actual = _mm_rsqrt_ps(input);
+        let expected = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
+        // Approximate instruction: compare every lane within a tolerance.
+        let rel_err = 0.00048828125;
+        let tolerance = 2. * rel_err;
+        for lane in 0..4 {
+            assert_approx_eq!(get_m128(actual, lane), get_m128(expected, lane), tolerance);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_min_ss() {
+        // Only the first lane takes the minimum; the rest come from `lhs`.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(-100.0, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_min_ss(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_min_ps() {
+        // Every lane takes the smaller of the two inputs.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(-100.0, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_min_ps(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_max_ss() {
+        // Only the first lane takes the maximum; the rest come from `lhs`.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        assert_eq_m128(_mm_max_ss(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_max_ps() {
+        // Every lane takes the larger of the two inputs.
+        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let expected = _mm_setr_ps(-1.0, 20.0, 0.0, -5.0);
+        assert_eq_m128(_mm_max_ps(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_and_ps() {
+        let lhs: __m128 = transmute(u32x4::splat(0b0011));
+        let rhs: __m128 = transmute(u32x4::splat(0b0101));
+        let expected: __m128 = transmute(u32x4::splat(0b0001));
+        // `black_box` keeps the operands opaque so the operation is not
+        // constant-folded away.
+        let actual = _mm_and_ps(*black_box(&lhs), *black_box(&rhs));
+        assert_eq_m128(actual, expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_andnot_ps() {
+        let lhs: __m128 = transmute(u32x4::splat(0b0011));
+        let rhs: __m128 = transmute(u32x4::splat(0b0101));
+        // (!0b0011) & 0b0101 == 0b0100
+        let expected: __m128 = transmute(u32x4::splat(0b0100));
+        // `black_box` keeps the operands opaque so the operation is not
+        // constant-folded away.
+        let actual = _mm_andnot_ps(*black_box(&lhs), *black_box(&rhs));
+        assert_eq_m128(actual, expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_or_ps() {
+        let lhs: __m128 = transmute(u32x4::splat(0b0011));
+        let rhs: __m128 = transmute(u32x4::splat(0b0101));
+        let expected: __m128 = transmute(u32x4::splat(0b0111));
+        // `black_box` keeps the operands opaque so the operation is not
+        // constant-folded away.
+        let actual = _mm_or_ps(*black_box(&lhs), *black_box(&rhs));
+        assert_eq_m128(actual, expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_xor_ps() {
+        let lhs: __m128 = transmute(u32x4::splat(0b0011));
+        let rhs: __m128 = transmute(u32x4::splat(0b0101));
+        let expected: __m128 = transmute(u32x4::splat(0b0110));
+        // `black_box` keeps the operands opaque so the operation is not
+        // constant-folded away.
+        let actual = _mm_xor_ps(*black_box(&lhs), *black_box(&rhs));
+        assert_eq_m128(actual, expected);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpeq_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(-1.0, 5.0, 6.0, 7.0), 0u32), // 1.0 == -1.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), 0xffffffffu32), // 1.0 == 1.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpeq_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmplt_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), 0u32), // 1.0 < 0.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), 0u32), // 1.0 < 1.0
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), !0u32), // 1.0 < 2.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmplt_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmple_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), 0u32), // 1.0 <= 0.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), !0u32), // 1.0 <= 1.0
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), !0u32), // 1.0 <= 2.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmple_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpgt_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), !0u32), // 1.0 > 0.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), 0u32), // 1.0 > 1.0
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), 0u32), // 1.0 > 2.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpgt_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpge_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), !0u32), // 1.0 >= 0.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), !0u32), // 1.0 >= 1.0
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), 0u32), // 1.0 >= 2.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpge_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpneq_ss() {
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), !0u32), // 1.0 != 0.0
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), 0u32), // 1.0 != 1.0
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), !0u32), // 1.0 != 2.0
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpneq_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnlt_ss() {
+        // For ordered operands "not less than" coincides with `>=`, so the
+        // first three cases mirror the `_mm_cmpge_ss` test. The two
+        // predicates only diverge when an operand is NaN: the negated
+        // predicate is then true, which the final case pins down.
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), !0u32), // !(1.0 < 0.0)
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), !0u32), // !(1.0 < 1.0)
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), 0u32), // !(1.0 < 2.0)
+            (_mm_setr_ps(NAN, 5.0, 6.0, 7.0), !0u32), // !(1.0 < NaN)
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpnlt_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnle_ss() {
+        // For ordered operands "not less than or equal" coincides with `>`,
+        // so the first three cases mirror the `_mm_cmpgt_ss` test. The two
+        // predicates only diverge when an operand is NaN: the negated
+        // predicate is then true, which the final case pins down.
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Each case pairs a right-hand operand with the mask expected in the
+        // first lane; the other lanes must be carried over from `a`.
+        let cases = [
+            (_mm_setr_ps(0.0, 5.0, 6.0, 7.0), !0u32), // !(1.0 <= 0.0)
+            (_mm_setr_ps(1.0, 5.0, 6.0, 7.0), 0u32), // !(1.0 <= 1.0)
+            (_mm_setr_ps(2.0, 5.0, 6.0, 7.0), 0u32), // !(1.0 <= 2.0)
+            (_mm_setr_ps(NAN, 5.0, 6.0, 7.0), !0u32), // !(1.0 <= NaN)
+        ];
+        for &(rhs, mask) in cases.iter() {
+            let actual: u32x4 = transmute(_mm_cmpnle_ss(a, rhs));
+            let expected: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(actual, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpngt_ss() {
+        // TODO: with these NaN-free inputs the expectations are the same as
+        // in the `_mm_cmple_ss` test. The two intrinsics presumably differ
+        // only when a NaN (signaling or quiet) is involved; NaN cases should
+        // be added to cover that.
+
+        let lhs = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // (lowest lane of the right operand, mask expected in lane 0)
+        let cases = [
+            (0.0f32, 0u32), // 1.0 > 0.0 holds
+            (1.0, !0u32),   // 1.0 > 1.0 does not hold
+            (2.0, !0u32),   // 1.0 > 2.0 does not hold
+        ];
+
+        for &(low, mask) in cases.iter() {
+            let rhs = _mm_setr_ps(low, 5.0, 6.0, 7.0);
+            let got: u32x4 = transmute(_mm_cmpngt_ss(lhs, rhs));
+            // Lanes 1..=3 are expected to equal those of `lhs`.
+            let want: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(got, want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnge_ss() {
+        // TODO: with these NaN-free inputs the expectations are the same as
+        // in the `_mm_cmplt_ss` test. The two intrinsics presumably differ
+        // only when a NaN (signaling or quiet) is involved; NaN cases should
+        // be added to cover that.
+
+        let lhs = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // (lowest lane of the right operand, mask expected in lane 0)
+        let cases = [
+            (0.0f32, 0u32), // 1.0 >= 0.0 holds
+            (1.0, 0u32),    // 1.0 >= 1.0 holds
+            (2.0, !0u32),   // 1.0 >= 2.0 does not hold
+        ];
+
+        for &(low, mask) in cases.iter() {
+            let rhs = _mm_setr_ps(low, 5.0, 6.0, 7.0);
+            let got: u32x4 = transmute(_mm_cmpnge_ss(lhs, rhs));
+            // Lanes 1..=3 are expected to equal those of `lhs`.
+            let want: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(got, want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpord_ss() {
+        let lhs = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // (lowest lane of the right operand, mask expected in lane 0);
+        // only the NaN operand makes the pair unordered.
+        let cases = [
+            (0.0f32, !0u32), // 1.0 ord 0.0
+            (NAN, 0u32),     // 1.0 ord NaN
+            (2.0, !0u32),    // 1.0 ord 2.0
+        ];
+
+        for &(low, mask) in cases.iter() {
+            let rhs = _mm_setr_ps(low, 5.0, 6.0, 7.0);
+            let got: u32x4 = transmute(_mm_cmpord_ss(lhs, rhs));
+            // Lanes 1..=3 are expected to equal those of `lhs`.
+            let want: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(got, want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpunord_ss() {
+        let lhs = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // (lowest lane of the right operand, mask expected in lane 0);
+        // only the NaN operand makes the pair unordered.
+        let cases = [
+            (0.0f32, 0u32), // 1.0 unord 0.0
+            (NAN, !0u32),   // 1.0 unord NaN
+            (2.0, 0u32),    // 1.0 unord 2.0
+        ];
+
+        for &(low, mask) in cases.iter() {
+            let rhs = _mm_setr_ps(low, 5.0, 6.0, 7.0);
+            let got: u32x4 = transmute(_mm_cmpunord_ss(lhs, rhs));
+            // Lanes 1..=3 are expected to equal those of `lhs`.
+            let want: u32x4 = transmute(_mm_setr_ps(transmute(mask), 2.0, 3.0, 4.0));
+            assert_eq!(got, want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpeq_ps() {
+        // Lane-wise `==`: only lane 2 (1.0 vs 1.0) is expected to match; the
+        // NaN/NaN lane is expected to compare unequal.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmpeq_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, no, yes, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmplt_ps() {
+        // Lane-wise `<`: only lane 0 (10.0 vs 15.0) is expected to be set;
+        // the NaN/NaN lane is expected to be clear.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmplt_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, no, no, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmple_ps() {
+        // Lane-wise `<=`: lanes 0 (10.0 vs 15.0) and 2 (1.0 vs 1.0) are
+        // expected to be set; the lane compared against NaN stays clear.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmple_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, no, yes, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpgt_ps() {
+        // Lane-wise `>`: only lane 1 (50.0 vs 20.0) is expected to be set;
+        // the lane holding NaN stays clear.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
+
+        let got: u32x4 = transmute(_mm_cmpgt_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, yes, no, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpge_ps() {
+        // Lane-wise `>=`: lanes 1 (50.0 vs 20.0) and 2 (1.0 vs 1.0) are
+        // expected to be set; the lane holding NaN stays clear.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
+
+        let got: u32x4 = transmute(_mm_cmpge_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, yes, yes, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpneq_ps() {
+        // Lane-wise `!=`: every lane except lane 2 (1.0 vs 1.0) is expected
+        // to be set, including the NaN/NaN lane.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmpneq_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, yes, no, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnlt_ps() {
+        // Lane-wise "not less than": only lane 0 (10.0 vs 15.0) is expected
+        // to be clear; the lane holding NaN is expected to be set.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+
+        let got: u32x4 = transmute(_mm_cmpnlt_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, yes, yes, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnle_ps() {
+        // Lane-wise "not less than or equal": lane 1 (50.0 vs 20.0) and the
+        // lane holding NaN are expected to be set.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+
+        let got: u32x4 = transmute(_mm_cmpnle_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, yes, no, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpngt_ps() {
+        // Lane-wise "not greater than": only lane 1 (50.0 vs 20.0) is
+        // expected to be clear; the lane holding NaN is expected to be set.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+
+        let got: u32x4 = transmute(_mm_cmpngt_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, no, yes, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpnge_ps() {
+        // Lane-wise "not greater than or equal": lane 0 (10.0 vs 15.0) and
+        // the lane holding NaN are expected to be set.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+        let rhs = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+
+        let got: u32x4 = transmute(_mm_cmpnge_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, no, no, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpord_ps() {
+        // Only lane 0 has no NaN on either side, so it is the only lane
+        // expected to be reported as ordered.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, NAN, NAN);
+        let rhs = _mm_setr_ps(15.0, NAN, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmpord_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(yes, no, no, no));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cmpunord_ps() {
+        // Lanes 1..=3 each have a NaN on at least one side and are expected
+        // to be reported as unordered; lane 0 is not.
+        let (yes, no) = (!0u32, 0u32);
+        let lhs = _mm_setr_ps(10.0, 50.0, NAN, NAN);
+        let rhs = _mm_setr_ps(15.0, NAN, 1.0, NAN);
+
+        let got: u32x4 = transmute(_mm_cmpunord_ps(lhs, rhs));
+        assert_eq!(got, u32x4::new(no, yes, yes, yes));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comieq_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `==`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [1i32, 0, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_comieq_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comilt_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `<`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [0i32, 1, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_comilt_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comile_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `<=`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [1i32, 1, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_comile_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comigt_ss() {
+        // Lowest-lane operands for each case: equal, less, greater, NaN/NaN.
+        let aa = &[3.0f32, 12.0, 23.0, NAN];
+        let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+        // Strict greater-than: the equal pair (i=0) must report 0.
+        let ee_gt = &[0i32, 0, 1, 0];
+        // Greater-or-equal: the equal pair (i=0) must report 1.
+        // NOTE(review): this test previously called only `_mm_comige_ss`
+        // despite its name; that check is kept so coverage is not lost.
+        // It can move to a dedicated `test_mm_comige_ss` if desired.
+        let ee_ge = &[1i32, 0, 1, 0];
+
+        for i in 0..4 {
+            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+            let r = _mm_comigt_ss(a, b);
+
+            assert_eq!(
+                ee_gt[i], r,
+                "_mm_comigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, ee_gt[i], i
+            );
+
+            let r = _mm_comige_ss(a, b);
+
+            assert_eq!(
+                ee_ge[i], r,
+                "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, ee_ge[i], i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comineq_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `!=`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [0i32, 1, 1, 1];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_comineq_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomieq_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `==`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [1i32, 0, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomieq_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomilt_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `<`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [0i32, 1, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomilt_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomile_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `<=`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [1i32, 1, 0, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomile_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomigt_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `>`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [0i32, 0, 1, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomigt_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomige_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `>=`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [1i32, 0, 1, 0];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomige_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_ucomineq_ss() {
+        // Lowest-lane operands for each case (equal, less, greater,
+        // NaN/NaN) and the integer result expected for `!=`.
+        let lhs = [3.0f32, 12.0, 23.0, NAN];
+        let rhs = [3.0f32, 47.5, 1.5, NAN];
+        let want = [0i32, 1, 1, 1];
+
+        for (i, &e) in want.iter().enumerate() {
+            let a = _mm_setr_ps(lhs[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(rhs[i], 0.0, 2.0, 4.0);
+            let r = _mm_ucomineq_ss(a, b);
+
+            assert_eq!(
+                e, r,
+                "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, e, i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
+        // If one of the arguments is a quiet NaN `comieq_ss` should signal an
+        // Invalid Operation Exception while `ucomieq_ss` should not.
+        let aa = &[3.0f32, NAN, 23.0, NAN];
+        let bb = &[3.0f32, 47.5, NAN, NAN];
+
+        // Both intrinsics must agree on the comparison result itself.
+        let ee = &[1i32, 0, 0, 0];
+        let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
+
+        for i in 0..4 {
+            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+            // Clear the exception flags before each comparison so the state
+            // read afterwards reflects that comparison only. `black_box`
+            // keeps the comparison from being folded away at compile time.
+            _MM_SET_EXCEPTION_STATE(0);
+            let r1 = _mm_comieq_ss(*black_box(&a), b);
+            let s1 = _MM_GET_EXCEPTION_STATE();
+
+            _MM_SET_EXCEPTION_STATE(0);
+            let r2 = _mm_ucomieq_ss(*black_box(&a), b);
+            let s2 = _MM_GET_EXCEPTION_STATE();
+
+            assert_eq!(
+                ee[i], r1,
+                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r1, ee[i], i
+            );
+            assert_eq!(
+                ee[i], r2,
+                "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r2, ee[i], i
+            );
+            assert_eq!(
+                s1,
+                exc[i] * _MM_EXCEPT_INVALID,
+                "_mm_comieq_ss() set exception flags: {} (i={})",
+                s1,
+                i
+            );
+            assert_eq!(
+                s2,
+                0, // ucomieq_ss should not signal an exception
+                "_mm_ucomieq_ss() set exception flags: {} (i={})",
+                s2,
+                i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cvtss_si32() {
+        // Inputs for the lowest lane paired (by position) with the integer
+        // each is expected to convert to. The out-of-range input (4.0e10)
+        // and NaN are both expected to yield `i32::min_value()`.
+        let sources = [42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
+        let expected = [42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520];
+
+        for (i, (&src, &e)) in sources.iter().zip(expected.iter()).enumerate() {
+            let x = _mm_setr_ps(src, 1.0, 3.0, 4.0);
+            let r = _mm_cvtss_si32(x);
+            assert_eq!(
+                e, r,
+                "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
+                i, x, r, e
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cvttss_si32() {
+        // (lowest-lane input, expected truncated integer). Fractions are
+        // expected to be dropped towards zero; the out-of-range input and
+        // NaN are both expected to yield `i32::min_value()`.
+        let cases = [
+            (42.0f32, 42i32),
+            (-31.4, -31),
+            (-33.5, -33),
+            (-34.5, -34),
+            (10.999, 10),
+            (-5.99, -5),
+            (4.0e10, i32::min_value()),
+            (4.0e-10, 0),
+            (NAN, i32::min_value()),
+            (2147483500.1, 2147483520),
+        ];
+
+        for (i, &(xi, e)) in cases.iter().enumerate() {
+            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+            let r = _mm_cvttss_si32(x);
+            assert_eq!(
+                e, r,
+                "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
+                i, x, r, e
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    pub unsafe fn test_mm_cvtsi32_ss() {
+        // (integer input, float expected in the lowest lane)
+        let cases = [
+            (4555i32, 4555.0f32),
+            (322223333, 322223330.0),
+            (-432, -432.0),
+            (-322223333, -322223330.0),
+        ];
+
+        // The upper three lanes are expected to be carried over from `base`.
+        let base = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+        for &(int_in, float_out) in cases.iter() {
+            let got = _mm_cvtsi32_ss(base, int_in);
+            let want = _mm_setr_ps(float_out, 6.0, 7.0, 8.0);
+            assert_eq_m128(want, got);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    pub unsafe fn test_mm_cvtss_f32() {
+        // The lowest lane is expected to come back unchanged.
+        let v = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
+        let lowest = _mm_cvtss_f32(v);
+        assert_eq!(lowest, 312.0134);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_set_ss() {
+        // The value goes to the lowest lane; the others are expected zero.
+        let got = _mm_set_ss(black_box(4.25));
+        let want = _mm_setr_ps(4.25, 0.0, 0.0, 0.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_set1_ps() {
+        // `_mm_set1_ps` and `_mm_set_ps1` are both expected to broadcast
+        // the value to all four lanes.
+        let via_set1 = _mm_set1_ps(black_box(4.25));
+        let via_set_ps1 = _mm_set_ps1(black_box(4.25));
+
+        for v in [via_set1, via_set_ps1].iter() {
+            for lane in 0..4 {
+                assert_eq!(get_m128(*v, lane), 4.25);
+            }
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_set_ps() {
+        let v = _mm_set_ps(
+            black_box(1.0),
+            black_box(2.0),
+            black_box(3.0),
+            black_box(4.0),
+        );
+
+        // Arguments are given highest lane first, so lane 0 holds the last
+        // argument.
+        for (lane, &want) in [4.0f32, 3.0, 2.0, 1.0].iter().enumerate() {
+            assert_eq!(get_m128(v, lane), want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_setr_ps() {
+        // `black_box` hides the values from the optimizer; the result is
+        // compared against the same vector built from plain literals.
+        let got = _mm_setr_ps(
+            black_box(1.0),
+            black_box(2.0),
+            black_box(3.0),
+            black_box(4.0),
+        );
+        let want = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_setzero_ps() {
+        // All four lanes are expected to be 0.0.
+        let zeroed = *black_box(&_mm_setzero_ps());
+        let want = _mm_set1_ps(0.0);
+        assert_eq_m128(zeroed, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_shuffle() {
+        // `_MM_SHUFFLE(z, y, x, w)` is expected to pack its four 2-bit
+        // selectors in argument order, with `z` in the top two bits.
+        let packed = [
+            _MM_SHUFFLE(0, 1, 1, 3),
+            _MM_SHUFFLE(3, 1, 1, 0),
+            _MM_SHUFFLE(1, 2, 2, 1),
+        ];
+        assert_eq!(packed[0], 0b00_01_01_11);
+        assert_eq!(packed[1], 0b11_01_01_00);
+        assert_eq!(packed[2], 0b01_10_10_01);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_shuffle_ps() {
+        let lo_src = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let hi_src = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // Selectors, lowest lane first: 3 and 1 pick from the first operand,
+        // 1 and 0 pick from the second.
+        let got = _mm_shuffle_ps(lo_src, hi_src, 0b00_01_01_11);
+        let want = _mm_setr_ps(4.0, 2.0, 6.0, 5.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_unpackhi_ps() {
+        let first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let second = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // The upper halves of both operands are expected to be interleaved.
+        let got = _mm_unpackhi_ps(first, second);
+        let want = _mm_setr_ps(3.0, 7.0, 4.0, 8.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_unpacklo_ps() {
+        let first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let second = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // The lower halves of both operands are expected to be interleaved.
+        let got = _mm_unpacklo_ps(first, second);
+        let want = _mm_setr_ps(1.0, 5.0, 2.0, 6.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_movehl_ps() {
+        let first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let second = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // Expected: upper half of `second` in the low lanes, upper half of
+        // `first` kept in the high lanes.
+        let got = _mm_movehl_ps(first, second);
+        let want = _mm_setr_ps(7.0, 8.0, 3.0, 4.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_movelh_ps() {
+        let first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let second = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // Expected: lower half of `first` kept in the low lanes, lower half
+        // of `second` in the high lanes.
+        let got = _mm_movelh_ps(first, second);
+        let want = _mm_setr_ps(1.0, 2.0, 5.0, 6.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_loadh_pi() {
+        let base = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let mem: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+
+        // The first two floats in memory are expected to replace the upper
+        // two lanes of `base`.
+        let got = _mm_loadh_pi(base, mem.as_ptr() as *const _);
+        let want = _mm_setr_ps(1.0, 2.0, 5.0, 6.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_loadl_pi() {
+        let base = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let mem: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+
+        // The first two floats in memory are expected to replace the lower
+        // two lanes of `base`.
+        let got = _mm_loadl_pi(base, mem.as_ptr() as *const _);
+        let want = _mm_setr_ps(5.0, 6.0, 3.0, 4.0);
+        assert_eq_m128(got, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_load_ss() {
+        let value = 42.0f32;
+        let src: *const f32 = &value;
+
+        // The scalar lands in lane 0; the other lanes are expected zero.
+        let got = _mm_load_ss(src);
+        assert_eq_m128(got, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_load1_ps() {
+        let value = 42.0f32;
+        let src: *const f32 = &value;
+
+        // The scalar is expected to be broadcast to all four lanes.
+        let got = _mm_load1_ps(src);
+        assert_eq_m128(got, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_load_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let base = vals.as_ptr();
+
+        // Make sure the load address is 16-byte aligned, otherwise we might
+        // get a (signal: 11, SIGSEGV: invalid memory reference).
+        // `skip` is the number of leading floats stepped over to get there.
+        let misalign = (base as usize) & 0xf;
+        let skip = if misalign == 0 {
+            0
+        } else {
+            ((16 - misalign) >> 2) as isize
+        };
+        let p = base.offset(skip);
+
+        let r = _mm_load_ps(p);
+
+        // The array holds consecutive values, so skipping `skip` elements
+        // shifts every expected lane by `skip`.
+        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(skip as f32));
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_loadu_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+
+        // Start three floats in, so the load covers elements 3..=6.
+        let src = vals.as_ptr().offset(3);
+        let got = _mm_loadu_ps(black_box(src));
+        assert_eq_m128(got, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_loadr_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let base = vals.as_ptr();
+
+        // Make sure the load address is 16-byte aligned, otherwise we might
+        // get a (signal: 11, SIGSEGV: invalid memory reference).
+        // `skip` is the number of leading floats stepped over to get there.
+        let misalign = (base as usize) & 0xf;
+        let skip = if misalign == 0 {
+            0
+        } else {
+            ((16 - misalign) >> 2) as isize
+        };
+        let p = base.offset(skip);
+
+        let r = _mm_loadr_ps(p);
+
+        // Lanes are expected in reverse memory order; skipping `skip`
+        // elements shifts every expected lane by `skip`.
+        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(skip as f32));
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_storeh_pi() {
+        let mut buf = [0.0f32; 8];
+        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        _mm_storeh_pi(buf.as_mut_ptr() as *mut _, v);
+
+        // The upper two lanes land in the first two slots; the third slot
+        // must be left untouched.
+        for (i, &want) in [3.0f32, 4.0, 0.0].iter().enumerate() {
+            assert_eq!(buf[i], want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_storel_pi() {
+        let mut buf = [0.0f32; 8];
+        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        _mm_storel_pi(buf.as_mut_ptr() as *mut _, v);
+
+        // The lower two lanes land in the first two slots; the third slot
+        // must be left untouched.
+        for (i, &want) in [1.0f32, 2.0, 0.0].iter().enumerate() {
+            assert_eq!(buf[i], want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_store_ss() {
+        let mut buf = [0.0f32; 8];
+        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // Store into slot 1 so the untouched neighbours on both sides can
+        // be checked.
+        let dst = buf.as_mut_ptr().offset(1);
+        _mm_store_ss(dst, v);
+
+        for (i, &want) in [0.0f32, 1.0, 0.0].iter().enumerate() {
+            assert_eq!(buf[i], want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_store1_ps() {
+        // Eight floats leave room to pick a 16-byte aligned window of four
+        // and still check the elements next to it.
+        let mut vals = [0.0f32; 8];
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        let mut ofs = 0;
+        let mut p = vals.as_mut_ptr();
+
+        // Align p to 16-byte boundary. The low address bits are masked
+        // before subtracting: `16 - (p as usize) & 0xf` would parse as
+        // `(16 - p) & 0xf` and overflow on the subtraction.
+        let misalignment = (p as usize) & 0xf;
+        if misalignment != 0 {
+            ofs = (16 - misalignment) >> 2;
+            p = p.offset(ofs as isize);
+        }
+
+        _mm_store1_ps(p, *black_box(&a));
+
+        // The lowest lane is expected in all four slots of the window, with
+        // the neighbouring elements left at zero.
+        if ofs > 0 {
+            assert_eq!(vals[ofs - 1], 0.0);
+        }
+        assert_eq!(vals[ofs + 0], 1.0);
+        assert_eq!(vals[ofs + 1], 1.0);
+        assert_eq!(vals[ofs + 2], 1.0);
+        assert_eq!(vals[ofs + 3], 1.0);
+        assert_eq!(vals[ofs + 4], 0.0);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_store_ps() {
+        // Eight floats leave room to pick a 16-byte aligned window of four
+        // and still check the elements next to it.
+        let mut vals = [0.0f32; 8];
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        let mut ofs = 0;
+        let mut p = vals.as_mut_ptr();
+
+        // Align p to 16-byte boundary. The low address bits are masked
+        // before subtracting: `16 - (p as usize) & 0xf` would parse as
+        // `(16 - p) & 0xf` and overflow on the subtraction.
+        let misalignment = (p as usize) & 0xf;
+        if misalignment != 0 {
+            ofs = (16 - misalignment) >> 2;
+            p = p.offset(ofs as isize);
+        }
+
+        _mm_store_ps(p, *black_box(&a));
+
+        // The four lanes are expected in memory order, with the
+        // neighbouring elements left at zero.
+        if ofs > 0 {
+            assert_eq!(vals[ofs - 1], 0.0);
+        }
+        assert_eq!(vals[ofs + 0], 1.0);
+        assert_eq!(vals[ofs + 1], 2.0);
+        assert_eq!(vals[ofs + 2], 3.0);
+        assert_eq!(vals[ofs + 3], 4.0);
+        assert_eq!(vals[ofs + 4], 0.0);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_storer_ps() {
+        // Eight floats leave room to pick a 16-byte aligned window of four
+        // and still check the elements next to it.
+        let mut vals = [0.0f32; 8];
+        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        let mut ofs = 0;
+        let mut p = vals.as_mut_ptr();
+
+        // Align p to 16-byte boundary. The low address bits are masked
+        // before subtracting: `16 - (p as usize) & 0xf` would parse as
+        // `(16 - p) & 0xf` and overflow on the subtraction.
+        let misalignment = (p as usize) & 0xf;
+        if misalignment != 0 {
+            ofs = (16 - misalignment) >> 2;
+            p = p.offset(ofs as isize);
+        }
+
+        _mm_storer_ps(p, *black_box(&a));
+
+        // The four lanes are expected in reverse order, with the
+        // neighbouring elements left at zero.
+        if ofs > 0 {
+            assert_eq!(vals[ofs - 1], 0.0);
+        }
+        assert_eq!(vals[ofs + 0], 4.0);
+        assert_eq!(vals[ofs + 1], 3.0);
+        assert_eq!(vals[ofs + 2], 2.0);
+        assert_eq!(vals[ofs + 3], 1.0);
+        assert_eq!(vals[ofs + 4], 0.0);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_storeu_ps() {
+        let mut buf = [0.0f32; 8];
+        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+        // Make sure the destination is *not* aligned to a 16-byte boundary:
+        // if the buffer happens to be aligned, start one float in.
+        let base = buf.as_mut_ptr();
+        let ofs = if (base as usize) & 0xf == 0 { 1 } else { 0 };
+        let dst = base.offset(ofs as isize);
+
+        _mm_storeu_ps(dst, *black_box(&v));
+
+        // The element before the window (if any) and the one after it must
+        // be left at zero.
+        if ofs > 0 {
+            assert_eq!(buf[ofs - 1], 0.0);
+        }
+        for (k, &want) in [1.0f32, 2.0, 3.0, 4.0, 0.0].iter().enumerate() {
+            assert_eq!(buf[ofs + k], want);
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_move_ss() {
+        let first = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let second = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+        // Lane 0 is expected from `second`, the remaining lanes from
+        // `first`.
+        let want = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
+        let got = _mm_move_ss(first, second);
+        assert_eq_m128(want, got);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_movemask_ps() {
+        // Bit `i` of the mask is expected to be set when lane `i` is
+        // negative.
+        let two_negative = _mm_setr_ps(-1.0, 5.0, -5.0, 0.0);
+        assert_eq!(_mm_movemask_ps(two_negative), 0b0101);
+
+        let three_negative = _mm_setr_ps(-1.0, -5.0, -5.0, 0.0);
+        assert_eq!(_mm_movemask_ps(three_negative), 0b0111);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_sfence() {
+        _mm_sfence(); // smoke test: only checks that the call completes
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_getcsr_setcsr_1() {
+        // Remember the control/status register so it can be put back once
+        // the multiplication has run.
+        let original_csr = _mm_getcsr();
+
+        let tiny = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+        let scale = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
+
+        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+        let product = _mm_mul_ps(*black_box(&tiny), *black_box(&scale));
+
+        _mm_setcsr(original_csr);
+
+        // With flush-to-zero on, the first component (which would otherwise
+        // be a denormalized f32) is expected to come out as 0.0.
+        let want = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
+        assert_eq_m128(product, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_getcsr_setcsr_2() {
+        // Same as `test_mm_getcsr_setcsr_1`, but with flush-to-zero turned
+        // off instead of on.
+
+        // Remember the control/status register so it can be put back once
+        // the multiplication has run.
+        let original_csr = _mm_getcsr();
+
+        let tiny = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+        let scale = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
+
+        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
+        let product = _mm_mul_ps(*black_box(&tiny), *black_box(&scale));
+
+        _mm_setcsr(original_csr);
+
+        // first component is a denormalized f32
+        let want = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
+        assert_eq_m128(product, want);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_getcsr_setcsr_underflow() {
+        // Start from a clean slate so that any flag seen later was raised
+        // by the multiplication below.
+        _MM_SET_EXCEPTION_STATE(0);
+
+        let tiny = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+        let scale = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
+
+        assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
+
+        let product = _mm_mul_ps(*black_box(&tiny), *black_box(&scale));
+
+        let want = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
+        assert_eq_m128(product, want);
+
+        // The underflow flag is expected to be raised by the tiny product.
+        let flags = _MM_GET_EXCEPTION_STATE();
+        let underflow = flags & _MM_EXCEPT_UNDERFLOW != 0;
+        assert_eq!(underflow, true);
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_MM_TRANSPOSE4_PS() {
+        // Four rows of a 4x4 matrix holding 1.0 ..= 16.0 in row-major order.
+        let mut row0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let mut row1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+        let mut row2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
+        let mut row3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
+
+        _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3);
+
+        // After the in-place transpose every row holds a former column.
+        let col0 = _mm_setr_ps(1.0, 5.0, 9.0, 13.0);
+        let col1 = _mm_setr_ps(2.0, 6.0, 10.0, 14.0);
+        let col2 = _mm_setr_ps(3.0, 7.0, 11.0, 15.0);
+        let col3 = _mm_setr_ps(4.0, 8.0, 12.0, 16.0);
+        assert_eq_m128(row0, col0);
+        assert_eq_m128(row1, col1);
+        assert_eq_m128(row2, col2);
+        assert_eq_m128(row3, col3);
+    }
+
+    /// Four `f32` values stored in a struct that is aligned to 16 bytes.
+    #[repr(align(16))]
+    struct Memory {
+        pub data: [f32; 4],
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_stream_ps() {
+        let value = _mm_set1_ps(7.0);
+        // Start from a sentinel so that an untouched lane would be noticed.
+        let mut buffer = Memory { data: [-1.0; 4] };
+
+        let dst = &mut buffer.data[0] as *mut f32;
+        _mm_stream_ps(dst, value);
+
+        // Every stored lane must match the corresponding lane of `value`.
+        let mut lane = 0;
+        while lane < 4 {
+            assert_eq!(buffer.data[lane], get_m128(value, lane));
+            lane += 1;
+        }
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_stream_pi() {
+        let value = transmute(i8x8::new(0, 0, 0, 0, 0, 0, 0, 7));
+        // Heap slot pre-filled with ones so the store is observable.
+        let mut slot = ::std::boxed::Box::<__m64>::new(transmute(i8x8::splat(1)));
+
+        _mm_stream_pi(&mut *slot as *mut _ as *mut _, value);
+
+        assert_eq_m64(value, *slot);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_max_pi16() {
+        let lhs = _mm_setr_pi16(-1, 6, -3, 8);
+        let rhs = _mm_setr_pi16(5, -2, 7, -4);
+        // Lane-wise signed maximum of the two inputs.
+        let expected = _mm_setr_pi16(5, 6, 7, 8);
+
+        // Both spellings of the intrinsic must agree.
+        let by_name = _mm_max_pi16(lhs, rhs);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _m_pmaxsw(lhs, rhs);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_max_pu8() {
+        let lhs = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
+        let rhs = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
+        // Lane-wise maximum of the two inputs.
+        let expected = _mm_setr_pi8(5, 6, 7, 8, 5, 6, 7, 8);
+
+        // Both spellings of the intrinsic must agree.
+        let by_name = _mm_max_pu8(lhs, rhs);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _m_pmaxub(lhs, rhs);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_min_pi16() {
+        let lhs = _mm_setr_pi16(-1, 6, -3, 8);
+        let rhs = _mm_setr_pi16(5, -2, 7, -4);
+        // Lane-wise signed minimum of the two inputs.
+        let expected = _mm_setr_pi16(-1, -2, -3, -4);
+
+        // Both spellings of the intrinsic must agree.
+        let by_name = _mm_min_pi16(lhs, rhs);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _m_pminsw(lhs, rhs);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_min_pu8() {
+        let lhs = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
+        let rhs = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
+        // Lane-wise minimum of the two inputs.
+        let expected = _mm_setr_pi8(2, 2, 3, 4, 2, 2, 3, 4);
+
+        // Both spellings of the intrinsic must agree.
+        let by_name = _mm_min_pu8(lhs, rhs);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _m_pminub(lhs, rhs);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_mulhi_pu16() {
+        let lhs = _mm_set1_pi16(1000);
+        let rhs = _mm_set1_pi16(1001);
+        // 1000 * 1001 = 1_001_000, whose upper 16 bits are 15.
+        let expected = _mm_set1_pi16(15);
+        assert_eq_m64(_mm_mulhi_pu16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_mullo_pi16() {
+        let lhs = _mm_set1_pi16(1000);
+        let rhs = _mm_set1_pi16(1001);
+        // 1000 * 1001 = 1_001_000, whose lower 16 bits are 17960.
+        let expected = _mm_set1_pi16(17960);
+        assert_eq_m64(_mm_mullo_pi16(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_m_pmulhuw() {
+        let lhs = _mm_set1_pi16(1000);
+        let rhs = _mm_set1_pi16(1001);
+        // 1000 * 1001 = 1_001_000, whose upper 16 bits are 15.
+        let expected = _mm_set1_pi16(15);
+        assert_eq_m64(_m_pmulhuw(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_avg_pu8() {
+        let lhs = _mm_set1_pi8(3);
+        let rhs = _mm_set1_pi8(9);
+        // The average of 3 and 9 in every lane.
+        let expected = _mm_set1_pi8(6);
+
+        assert_eq_m64(_mm_avg_pu8(lhs, rhs), expected);
+        // The alias must produce the same result.
+        assert_eq_m64(_m_pavgb(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_avg_pu16() {
+        let lhs = _mm_set1_pi16(3);
+        let rhs = _mm_set1_pi16(9);
+        // The average of 3 and 9 in every lane.
+        let expected = _mm_set1_pi16(6);
+
+        assert_eq_m64(_mm_avg_pu16(lhs, rhs), expected);
+        // The alias must produce the same result.
+        assert_eq_m64(_m_pavgw(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_sad_pu8() {
+        #[rustfmt::skip]
+        let lhs = _mm_setr_pi8(
+            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
+            1, 2, 3, 4,
+        );
+        let rhs = _mm_setr_pi8(0, 0, 0, 0, 2, 1, 2, 1);
+        // |255| + |254| + |253| + |252| + 1 + 1 + 1 + 3 = 1020, placed in the
+        // first 16-bit lane; the remaining lanes are zero.
+        let expected = _mm_setr_pi16(1020, 0, 0, 0);
+
+        assert_eq_m64(_mm_sad_pu8(lhs, rhs), expected);
+        // The alias must produce the same result.
+        assert_eq_m64(_m_psadbw(lhs, rhs), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpi32_ps() {
+        let floats = _mm_setr_ps(0., 0., 3., 4.);
+        let ints = _mm_setr_pi32(1, 2);
+        // The two converted integers replace the two lowest lanes of `floats`.
+        let expected = _mm_setr_ps(1., 2., 3., 4.);
+
+        assert_eq_m128(_mm_cvtpi32_ps(floats, ints), expected);
+        // The alias must produce the same result.
+        assert_eq_m128(_mm_cvt_pi2ps(floats, ints), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpi16_ps() {
+        // Four 16-bit integers become the four `f32` lanes of the result.
+        let ints = _mm_setr_pi16(1, 2, 3, 4);
+        let converted = _mm_cvtpi16_ps(ints);
+        assert_eq_m128(converted, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpu16_ps() {
+        // Four 16-bit integers become the four `f32` lanes of the result.
+        let ints = _mm_setr_pi16(1, 2, 3, 4);
+        let converted = _mm_cvtpu16_ps(ints);
+        assert_eq_m128(converted, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpi8_ps() {
+        // Only the four lowest bytes show up in the converted result.
+        let bytes = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
+        let converted = _mm_cvtpi8_ps(bytes);
+        assert_eq_m128(converted, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpu8_ps() {
+        // Only the four lowest bytes show up in the converted result.
+        let bytes = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
+        let converted = _mm_cvtpu8_ps(bytes);
+        assert_eq_m128(converted, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtpi32x2_ps() {
+        // `low` supplies lanes 0-1 of the result and `high` lanes 2-3.
+        let low = _mm_setr_pi32(1, 2);
+        let high = _mm_setr_pi32(3, 4);
+        let converted = _mm_cvtpi32x2_ps(low, high);
+        assert_eq_m128(converted, _mm_setr_ps(1., 2., 3., 4.));
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_maskmove_si64() {
+        let source = _mm_set1_pi8(9);
+        // Only byte 2 of the mask has its top bit set.
+        let mask = _mm_setr_pi8(0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0);
+        // Hence only byte 2 of the zeroed destination is overwritten.
+        let expected = _mm_setr_pi8(0, 0, 9, 0, 0, 0, 0, 0);
+
+        let mut dest = _mm_set1_pi8(0);
+        _mm_maskmove_si64(source, mask, &mut dest as *mut _ as *mut i8);
+        assert_eq_m64(dest, expected);
+
+        // The alias must behave the same on a fresh destination.
+        let mut dest = _mm_set1_pi8(0);
+        _m_maskmovq(source, mask, &mut dest as *mut _ as *mut i8);
+        assert_eq_m64(dest, expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_extract_pi16() {
+        let lanes = _mm_setr_pi16(1, 2, 3, 4);
+
+        // Index 0 and index 1 pick the first and second lane respectively.
+        let first = _mm_extract_pi16(lanes, 0);
+        assert_eq!(first, 1);
+        let second = _mm_extract_pi16(lanes, 1);
+        assert_eq!(second, 2);
+
+        // The alias must produce the same result.
+        let second = _m_pextrw(lanes, 1);
+        assert_eq!(second, 2);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_insert_pi16() {
+        let lanes = _mm_setr_pi16(1, 2, 3, 4);
+
+        // Writing 0 at index 0 replaces the first lane only.
+        let first_cleared = _mm_setr_pi16(0, 2, 3, 4);
+        assert_eq_m64(_mm_insert_pi16(lanes, 0, 0b0), first_cleared);
+
+        // Writing 0 at index 2 replaces the third lane only.
+        let third_cleared = _mm_setr_pi16(1, 2, 0, 4);
+        assert_eq_m64(_mm_insert_pi16(lanes, 0, 0b10), third_cleared);
+
+        // The alias must produce the same result.
+        assert_eq_m64(_m_pinsrw(lanes, 0, 0b10), third_cleared);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_movemask_pi8() {
+        // Only the 0x80 bytes have their top bit set; the expected mask has
+        // bits 0 and 4 set.
+        let input = _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
+
+        let mask = _mm_movemask_pi8(input);
+        assert_eq!(mask, 0b10001);
+
+        // The alias must produce the same result.
+        let mask = _m_pmovmskb(input);
+        assert_eq!(mask, 0b10001);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_shuffle_pi16() {
+        let lanes = _mm_setr_pi16(1, 2, 3, 4);
+        // Selector fields, lowest first: 3, 1, 1, 0 -> lanes 4, 2, 2, 1.
+        let expected = _mm_setr_pi16(4, 2, 2, 1);
+
+        assert_eq_m64(_mm_shuffle_pi16(lanes, 0b00_01_01_11), expected);
+        // The alias must produce the same result.
+        assert_eq_m64(_m_pshufw(lanes, 0b00_01_01_11), expected);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtps_pi32() {
+        let floats = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        // Only the two lowest lanes end up in the 64-bit result.
+        let expected = _mm_setr_pi32(1, 2);
+
+        let by_name = _mm_cvtps_pi32(floats);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _mm_cvt_ps2pi(floats);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvttps_pi32() {
+        let floats = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
+        // Only the two lowest lanes end up in the 64-bit result.
+        let expected = _mm_setr_pi32(7, 2);
+
+        let by_name = _mm_cvttps_pi32(floats);
+        assert_eq_m64(expected, by_name);
+        let by_alias = _mm_cvtt_ps2pi(floats);
+        assert_eq_m64(expected, by_alias);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtps_pi16() {
+        // All four float lanes are converted to 16-bit integers.
+        let floats = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
+        let expected = _mm_setr_pi16(7, 2, 3, 4);
+        let converted = _mm_cvtps_pi16(floats);
+        assert_eq_m64(expected, converted);
+    }
+
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_cvtps_pi8() {
+        // The four converted values fill the low bytes; the upper four bytes
+        // of the expected result are zero.
+        let floats = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
+        let expected = _mm_setr_pi8(7, 2, 3, 4, 0, 0, 0, 0);
+        let converted = _mm_cvtps_pi8(floats);
+        assert_eq_m64(expected, converted);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
new file mode 100644
index 00000000000..a21743ec4ae
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -0,0 +1,5253 @@
+//! Streaming SIMD Extensions 2 (SSE2)
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use intrinsics;
+use mem;
+use ptr;
+
+/// Hints to the processor that the surrounding code is a spin-wait loop.
+///
+/// Issuing this hint inside a spin-wait loop can improve both its performance
+/// and its power consumption.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_pause)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pause))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_pause() {
+    pause();
+}
+
+/// Invalidates the cache line containing `p` and flushes it from every level
+/// of the cache hierarchy.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clflush)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(clflush))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_clflush(p: *mut u8) {
+    clflush(p);
+}
+
+/// Serializes all load-from-memory instructions issued before this
+/// instruction.
+///
+/// Every load instruction that precedes the fence in program order is
+/// guaranteed to be globally visible before any load instruction that follows
+/// the fence in program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lfence)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(lfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_lfence() {
+    lfence();
+}
+
+/// Serializes all load-from-memory and store-to-memory instructions issued
+/// before this instruction.
+///
+/// Every memory access that precedes the fence in program order is guaranteed
+/// to be globally visible before any memory instruction that follows the
+/// fence in program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mfence)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mfence() {
+    mfence();
+}
+
+/// Adds the packed 8-bit integers of `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let sum = simd_add(a.as_i8x16(), b.as_i8x16());
+    mem::transmute(sum)
+}
+
+/// Adds the packed 16-bit integers of `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let sum = simd_add(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(sum)
+}
+
+/// Adds the packed 32-bit integers of `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let sum = simd_add(a.as_i32x4(), b.as_i32x4());
+    mem::transmute(sum)
+}
+
+/// Adds the packed 64-bit integers of `a` and `b` lane by lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
+    let sum = simd_add(a.as_i64x2(), b.as_i64x2());
+    mem::transmute(sum)
+}
+
+/// Adds the packed 8-bit integers of `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+    mem::transmute(paddsb(lhs, rhs))
+}
+
+/// Adds the packed 16-bit integers of `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    mem::transmute(paddsw(lhs, rhs))
+}
+
+/// Adds the packed unsigned 8-bit integers of `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_u8x16(), b.as_u8x16());
+    mem::transmute(paddsub(lhs, rhs))
+}
+
+/// Adds the packed unsigned 16-bit integers of `a` and `b` with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_u16x8(), b.as_u16x8());
+    mem::transmute(paddsuw(lhs, rhs))
+}
+
+/// Averages the packed unsigned 8-bit integers of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pavgb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let average = pavgb(a.as_u8x16(), b.as_u8x16());
+    mem::transmute(average)
+}
+
+/// Averages the packed unsigned 16-bit integers of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pavgw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
+    let average = pavgw(a.as_u16x8(), b.as_u16x8());
+    mem::transmute(average)
+}
+
+/// Multiplies signed 16-bit integers in `a` and `b`, then horizontally adds
+/// the products.
+///
+/// The packed signed 16-bit integers in `a` and `b` are multiplied into
+/// intermediate signed 32-bit integers; adjacent pairs of those intermediate
+/// values are then added together.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaddwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let pair_sums = pmaddwd(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(pair_sums)
+}
+
+/// Returns the lane-wise maximum of the packed 16-bit integers in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let maxima = pmaxsw(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(maxima)
+}
+
+/// Returns the lane-wise maximum of the packed unsigned 8-bit integers in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let maxima = pmaxub(a.as_u8x16(), b.as_u8x16());
+    mem::transmute(maxima)
+}
+
+/// Returns the lane-wise minimum of the packed 16-bit integers in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let minima = pminsw(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(minima)
+}
+
+/// Returns the lane-wise minimum of the packed unsigned 8-bit integers in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let minima = pminub(a.as_u8x16(), b.as_u8x16());
+    mem::transmute(minima)
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, keeping the high
+/// halves.
+///
+/// Each multiplication produces an intermediate 32-bit integer; the high 16
+/// bits of every intermediate integer are returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let high_halves = pmulhw(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(high_halves)
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, keeping the
+/// high halves.
+///
+/// Each multiplication produces an intermediate 32-bit integer; the high 16
+/// bits of every intermediate integer are returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
+    let high_halves = pmulhuw(a.as_u16x8(), b.as_u16x8());
+    mem::transmute(high_halves)
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, keeping the low
+/// halves.
+///
+/// Each multiplication produces an intermediate 32-bit integer; the low 16
+/// bits of every intermediate integer are returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let low_halves = simd_mul(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(low_halves)
+}
+
+/// Multiplies the low unsigned 32-bit integers of each packed 64-bit element
+/// in `a` and `b`.
+///
+/// The unsigned 64-bit products are returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epu32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
+    let products = pmuludq(a.as_u32x4(), b.as_u32x4());
+    mem::transmute(products)
+}
+
+/// Sums the absolute differences of packed unsigned 8-bit integers.
+///
+/// The absolute differences of the packed unsigned 8-bit integers in `a` and
+/// `b` are computed, each run of 8 consecutive differences is summed into an
+/// unsigned 16-bit integer, and the two sums are stored in the low 16 bits of
+/// the returned 64-bit elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psadbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let sums = psadbw(a.as_u8x16(), b.as_u8x16());
+    mem::transmute(sums)
+}
+
+/// Subtracts the packed 8-bit integers in `b` from those in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let difference = simd_sub(a.as_i8x16(), b.as_i8x16());
+    mem::transmute(difference)
+}
+
+/// Subtracts the packed 16-bit integers in `b` from those in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let difference = simd_sub(a.as_i16x8(), b.as_i16x8());
+    mem::transmute(difference)
+}
+
+/// Subtracts the packed 32-bit integers in `b` from those in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let difference = simd_sub(a.as_i32x4(), b.as_i32x4());
+    mem::transmute(difference)
+}
+
+/// Subtracts the packed 64-bit integers in `b` from those in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
+    let difference = simd_sub(a.as_i64x2(), b.as_i64x2());
+    mem::transmute(difference)
+}
+
+/// Subtracts the packed 8-bit integers in `b` from those in `a` with
+/// saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+    mem::transmute(psubsb(lhs, rhs))
+}
+
+/// Subtracts the packed 16-bit integers in `b` from those in `a` with
+/// saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    mem::transmute(psubsw(lhs, rhs))
+}
+
+/// Subtracts the packed unsigned 8-bit integers in `b` from those in `a` with
+/// saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_u8x16(), b.as_u8x16());
+    mem::transmute(psubusb(lhs, rhs))
+}
+
+/// Subtracts the packed unsigned 16-bit integers in `b` from those in `a`
+/// with saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_u16x8(), b.as_u16x8());
+    mem::transmute(psubusw(lhs, rhs))
+}
+
+/// Shifts `a` left by `imm8` bytes, filling the vacated bytes with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
+    // The byte shift itself lives in the shared helper.
+    let shifted = _mm_slli_si128_impl(a, imm8);
+    shifted
+}
+
+/// Implementation detail: turns the immediate argument of the
+/// `_mm_slli_si128` intrinsic into a compile-time constant by dispatching on
+/// its value.
+#[inline]
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_slli_si128_impl(a: __m128i, imm8: i32) -> __m128i {
+    let zeros = _mm_set1_epi8(0).as_i8x16();
+    let bytes = a.as_i8x16();
+    // Builds the shuffle of `zeros` and `bytes` for one constant shift
+    // amount; the sixteen indices are `16 - shift ..= 31 - shift`.
+    macro_rules! shuffle {
+        ($shift:expr) => {
+            simd_shuffle16::<i8x16, i8x16>(
+                zeros,
+                bytes,
+                [
+                    16 - $shift,
+                    17 - $shift,
+                    18 - $shift,
+                    19 - $shift,
+                    20 - $shift,
+                    21 - $shift,
+                    22 - $shift,
+                    23 - $shift,
+                    24 - $shift,
+                    25 - $shift,
+                    26 - $shift,
+                    27 - $shift,
+                    28 - $shift,
+                    29 - $shift,
+                    30 - $shift,
+                    31 - $shift,
+                ],
+            )
+        };
+    }
+    // The immediate is reinterpreted as unsigned, so negative values land in
+    // the catch-all arm together with everything above 15.
+    let shifted = match imm8 as u32 {
+        0 => shuffle!(0),
+        1 => shuffle!(1),
+        2 => shuffle!(2),
+        3 => shuffle!(3),
+        4 => shuffle!(4),
+        5 => shuffle!(5),
+        6 => shuffle!(6),
+        7 => shuffle!(7),
+        8 => shuffle!(8),
+        9 => shuffle!(9),
+        10 => shuffle!(10),
+        11 => shuffle!(11),
+        12 => shuffle!(12),
+        13 => shuffle!(13),
+        14 => shuffle!(14),
+        15 => shuffle!(15),
+        _ => shuffle!(16),
+    };
+    mem::transmute(shifted)
+}
+
+/// Shifts `a` left by `imm8` bytes, filling the vacated bytes with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
+    // Uses the same helper as `_mm_slli_si128`.
+    let shifted = _mm_slli_si128_impl(a, imm8);
+    shifted
+}
+
+/// Shifts `a` right by `imm8` bytes, filling the vacated bytes with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
+    // The byte shift itself lives in the shared helper.
+    let shifted = _mm_srli_si128_impl(a, imm8);
+    shifted
+}
+
+/// Shifts the packed 16-bit integers in `a` left by `imm8`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllw, imm8 = 7))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i {
+    let shifted = pslliw(a.as_i16x8(), imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 16-bit integers in `a` left by `count`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
+    let shifted = psllw(a.as_i16x8(), count.as_i16x8());
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 32-bit integers in `a` left by `imm8`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslld, imm8 = 7))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i {
+    let shifted = psllid(a.as_i32x4(), imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 32-bit integers in `a` left by `count`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
+    let shifted = pslld(a.as_i32x4(), count.as_i32x4());
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 64-bit integers in `a` left by `imm8`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllq, imm8 = 7))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i {
+    let shifted = pslliq(a.as_i64x2(), imm8);
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 64-bit integers in `a` left by `count`, shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
+    let shifted = psllq(a.as_i64x2(), count.as_i64x2());
+    mem::transmute(shifted)
+}
+
+/// Shifts the packed 16-bit integers in `a` right by `imm8`, shifting in
+/// sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psraw, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i {
+    let shifted = psraiw(a.as_i16x8(), imm8);
+    mem::transmute(shifted)
+}
+
+/// Right-shifts every packed 16-bit integer of `a` by the amount held in
+/// `count`, replicating the sign bit into the vacated positions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
+    let lanes = a.as_i16x8();
+    let amount = count.as_i16x8();
+    mem::transmute(psraw(lanes, amount))
+}
+
+/// Right-shifts every packed 32-bit integer of `a` by `imm8`, replicating the
+/// sign bit into the vacated positions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrad, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i {
+    let lanes = a.as_i32x4();
+    let shifted = psraid(lanes, imm8);
+    mem::transmute(shifted)
+}
+
+/// Right-shifts every packed 32-bit integer of `a` by the amount held in
+/// `count`, replicating the sign bit into the vacated positions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
+    let lanes = a.as_i32x4();
+    let amount = count.as_i32x4();
+    mem::transmute(psrad(lanes, amount))
+}
+
+/// Shifts the whole 128-bit value `a` right by `imm8` bytes, shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
+    _mm_srli_si128_impl(a, imm8) // helper selects a constant byte shuffle per `imm8`
+}
+
+/// Implementation detail of `_mm_srli_si128`: dispatches on `imm8` so that each
+/// `simd_shuffle16` call receives compile-time constant lane indices.
+#[inline]
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_srli_si128_impl(a: __m128i, imm8: i32) -> __m128i {
+    let (zero, imm8) = (_mm_set1_epi8(0).as_i8x16(), imm8 as u32);
+    let a = a.as_i8x16();
+    macro_rules! shuffle {
+        ($shift:expr) => {
+            simd_shuffle16(
+                a,
+                zero, // second operand: source of the zero bytes shifted in
+                [
+                    0 + $shift, // output byte i reads combined lane i + $shift
+                    1 + $shift,
+                    2 + $shift,
+                    3 + $shift,
+                    4 + $shift,
+                    5 + $shift,
+                    6 + $shift,
+                    7 + $shift,
+                    8 + $shift,
+                    9 + $shift,
+                    10 + $shift,
+                    11 + $shift,
+                    12 + $shift,
+                    13 + $shift,
+                    14 + $shift,
+                    15 + $shift,
+                ],
+            )
+        };
+    }
+    let x: i8x16 = match imm8 {
+        0 => shuffle!(0),
+        1 => shuffle!(1),
+        2 => shuffle!(2),
+        3 => shuffle!(3),
+        4 => shuffle!(4),
+        5 => shuffle!(5),
+        6 => shuffle!(6),
+        7 => shuffle!(7),
+        8 => shuffle!(8),
+        9 => shuffle!(9),
+        10 => shuffle!(10),
+        11 => shuffle!(11),
+        12 => shuffle!(12),
+        13 => shuffle!(13),
+        14 => shuffle!(14),
+        15 => shuffle!(15),
+        _ => shuffle!(16), // all other values, incl. negative `imm8` after `as u32`
+    };
+    mem::transmute(x)
+}
+
+/// Right-shifts every packed 16-bit integer of `a` by `imm8`, filling the
+/// vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlw, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i {
+    let lanes = a.as_i16x8();
+    let shifted = psrliw(lanes, imm8);
+    mem::transmute(shifted)
+}
+
+/// Right-shifts every packed 16-bit integer of `a` by the amount held in
+/// `count`, filling the vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
+    let lanes = a.as_i16x8();
+    let amount = count.as_i16x8();
+    mem::transmute(psrlw(lanes, amount))
+}
+
+/// Right-shifts every packed 32-bit integer of `a` by `imm8`, filling the
+/// vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrld, imm8 = 8))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i {
+    let lanes = a.as_i32x4();
+    let shifted = psrlid(lanes, imm8);
+    mem::transmute(shifted)
+}
+
+/// Right-shifts every packed 32-bit integer of `a` by the amount held in
+/// `count`, filling the vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
+    let lanes = a.as_i32x4();
+    let amount = count.as_i32x4();
+    mem::transmute(psrld(lanes, amount))
+}
+
+/// Right-shifts every packed 64-bit integer of `a` by `imm8`, filling the
+/// vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i {
+    let lanes = a.as_i64x2();
+    let shifted = psrliq(lanes, imm8);
+    mem::transmute(shifted)
+}
+
+/// Right-shifts every packed 64-bit integer of `a` by the amount held in
+/// `count`, filling the vacated bits with zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
+    let lanes = a.as_i64x2();
+    let amount = count.as_i64x2();
+    mem::transmute(psrlq(lanes, amount))
+}
+
+/// Computes the bitwise AND of the 128 bits of integer data held in `a` and
+/// `b`, returning the result as a new vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
+    simd_and(a, b)
+}
+
+/// Inverts all 128 bits (representing integer data) of `a` and ANDs the result
+/// with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
+    // XOR with an all-ones vector is the bitwise NOT of `a`.
+    let all_ones = _mm_set1_epi8(-1);
+    let not_a = simd_xor(all_ones, a);
+    simd_and(not_a, b)
+}
+
+/// Computes the bitwise OR of the 128 bits of integer data held in `a` and
+/// `b`, returning the result as a new vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(orps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
+    simd_or(a, b)
+}
+
+/// Computes the bitwise XOR of the 128 bits of integer data held in `a` and
+/// `b`, returning the result as a new vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
+    simd_xor(a, b)
+}
+
+/// Tests the packed 8-bit integers of `a` and `b` lane by lane for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+    let mask: i8x16 = simd_eq(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests the packed 16-bit integers of `a` and `b` lane by lane for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    let mask: i16x8 = simd_eq(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests the packed 32-bit integers of `a` and `b` lane by lane for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+    let mask: i32x4 = simd_eq(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 8-bit integers of `a` are greater
+/// than those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+    let mask: i8x16 = simd_gt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 16-bit integers of `a` are greater
+/// than those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    let mask: i16x8 = simd_gt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 32-bit integers of `a` are greater
+/// than those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+    let mask: i32x4 = simd_gt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 8-bit integers of `a` are less than
+/// those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i8x16(), b.as_i8x16());
+    let mask: i8x16 = simd_lt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 16-bit integers of `a` are less than
+/// those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    let mask: i16x8 = simd_lt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Tests lane by lane whether the packed 32-bit integers of `a` are less than
+/// those of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+    let mask: i32x4 = simd_lt(lhs, rhs);
+    mem::transmute(mask)
+}
+
+/// Converts the two lowest packed 32-bit integers of `a` into packed
+/// double-precision (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtdq2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
+    let lanes = a.as_i32x4();
+    // Keep only lanes 0 and 1 before widening each one to `f64`.
+    let low_pair: i32x2 = simd_shuffle2(lanes, lanes, [0, 1]);
+    simd_cast(low_pair)
+}
+
+/// Returns `a` with its lower element replaced by `b` after converting it to
+/// an `f64`; the remaining element of `a` is passed through unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
+    simd_insert(a, 0, b as f64) // lane 0 is the lower element
+}
+
+/// Converts the packed 32-bit integers in `a` to packed single-precision
+/// (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtdq2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
+    cvtdq2ps(a.as_i32x4())
+}
+
+/// Converts the packed single-precision (32-bit) floating-point elements of
+/// `a` into packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i {
+    let converted = cvtps2dq(a);
+    mem::transmute(converted)
+}
+
+/// Builds a vector holding `a` in its lowest element and `0` in every higher
+/// element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i {
+    let lanes = i32x4::new(a, 0, 0, 0);
+    mem::transmute(lanes)
+}
+
+/// Extracts the lowest 32-bit element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
+    let lanes = a.as_i32x4();
+    simd_extract(lanes, 0)
+}
+
+/// Builds a vector of two 64-bit integers from the arguments, which are listed
+/// from highest (`e1`) to lowest (`e0`).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
+    let lanes = i64x2::new(e0, e1);
+    mem::transmute(lanes)
+}
+
+/// Builds a vector of four 32-bit integers from the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
+    // `e0` is handed to `i32x4::new` first and `e3` last.
+    let lanes = i32x4::new(e0, e1, e2, e3);
+    mem::transmute(lanes)
+}
+
+/// Builds a vector of eight 16-bit integers from the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi16(
+    e7: i16,
+    e6: i16,
+    e5: i16,
+    e4: i16,
+    e3: i16,
+    e2: i16,
+    e1: i16,
+    e0: i16,
+) -> __m128i {
+    // `e0` is handed to `i16x8::new` first and `e7` last.
+    let lanes = i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+    mem::transmute(lanes)
+}
+
+/// Builds a vector of sixteen 8-bit integers from the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi8(
+    e15: i8,
+    e14: i8,
+    e13: i8,
+    e12: i8,
+    e11: i8,
+    e10: i8,
+    e9: i8,
+    e8: i8,
+    e7: i8,
+    e6: i8,
+    e5: i8,
+    e4: i8,
+    e3: i8,
+    e2: i8,
+    e1: i8,
+    e0: i8,
+) -> __m128i {
+    // `e0` is handed to `i8x16::new` first and `e15` last.
+    #[rustfmt::skip]
+    let lanes = i8x16::new(
+        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+    );
+    mem::transmute(lanes)
+}
+
+/// Broadcasts the 64-bit integer `a` to both elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i {
+    _mm_set_epi64x(a, a)
+}
+
+/// Broadcasts the 32-bit integer `a` to all four elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i {
+    _mm_set_epi32(a, a, a, a)
+}
+
+/// Broadcasts the 16-bit integer `a` to all eight elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i {
+    _mm_set_epi16(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts the 8-bit integer `a` to all sixteen elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i {
+    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+
+/// Sets packed 32-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
+    _mm_set_epi32(e0, e1, e2, e3) // argument list reversed: first parameter goes last
+}
+
+/// Sets packed 16-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi16(
+    e7: i16,
+    e6: i16,
+    e5: i16,
+    e4: i16,
+    e3: i16,
+    e2: i16,
+    e1: i16,
+    e0: i16,
+) -> __m128i {
+    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) // argument list reversed
+}
+
+/// Sets packed 8-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi8(
+    e15: i8,
+    e14: i8,
+    e13: i8,
+    e12: i8,
+    e11: i8,
+    e10: i8,
+    e9: i8,
+    e8: i8,
+    e7: i8,
+    e6: i8,
+    e5: i8,
+    e4: i8,
+    e3: i8,
+    e2: i8,
+    e1: i8,
+    e0: i8,
+) -> __m128i {
+    #[rustfmt::skip]
+    _mm_set_epi8(
+        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, // reversed
+    )
+}
+
+/// Returns a vector with all elements set to zero, built as a 64-bit broadcast of `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_si128() -> __m128i {
+    _mm_set1_epi64x(0)
+}
+
+/// Reads a 64-bit integer from `mem_addr` into the first element of the
+/// returned vector; the other element is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME movsd on windows
+#[cfg_attr(
+    all(
+        test,
+        not(windows),
+        not(all(target_os = "linux", target_arch = "x86_64")),
+        target_arch = "x86_64"
+    ),
+    assert_instr(movq)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
+    // Unaligned read: only 8 bytes behind `mem_addr` are accessed.
+    let low = ptr::read_unaligned(mem_addr as *const i64);
+    _mm_set_epi64x(0, low)
+}
+
+/// Loads 128 bits of integer data from memory into a new vector.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
+    *mem_addr // plain dereference, hence the alignment requirement above
+}
+
+/// Loads 128 bits of integer data from memory into a new vector.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
+    let mut dst: __m128i = _mm_undefined_si128(); // fully overwritten by the copy below
+    ptr::copy_nonoverlapping(
+        mem_addr as *const u8, // copied as bytes, so `mem_addr` needs no alignment
+        &mut dst as *mut __m128i as *mut u8,
+        mem::size_of::<__m128i>(),
+    );
+    dst
+}
+
+/// Conditionally stores 8-bit integer elements from `a` into memory using
+/// `mask`.
+///
+/// Elements are not stored when the highest bit is not set in the
+/// corresponding element of `mask`.
+///
+/// `mem_addr` should correspond to a 128-bit memory location and does not need
+/// to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maskmovdqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
+    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
+}
+
+/// Stores 128 bits of integer data from `a` into memory.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
+    *mem_addr = a; // plain write through the pointer, hence the alignment requirement
+}
+
+/// Stores 128 bits of integer data from `a` into memory.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
+    storeudq(mem_addr as *mut i8, a);
+}
+
+/// Stores the lower 64 bits of `a` (its first 8 bytes) to a memory location.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME mov on windows, movlps on i686
+#[cfg_attr(
+    all(
+        test,
+        not(windows),
+        not(all(target_os = "linux", target_arch = "x86_64")),
+        target_arch = "x86_64"
+    ),
+    assert_instr(movq)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
+    ptr::copy_nonoverlapping(&a as *const _ as *const u8, mem_addr as *mut u8, 8);
+}
+
+/// Stores the 128-bit integer vector `a` to a 128-bit aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
+    ::intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Stores the 32-bit integer `a` in the memory location `mem_addr`.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movnti))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
+    ::intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Builds a vector whose low element is the low element of `a` and whose upper
+/// element is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME movd on windows, movd on i686
+#[cfg_attr(all(test, not(windows), target_arch = "x86_64"), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
+    let zeros = _mm_setzero_si128();
+    // Index 0 keeps the low lane of `a`; index 2 supplies the zero upper lane.
+    let merged: i64x2 = simd_shuffle2(a.as_i64x2(), zeros.as_i64x2(), [0, 2]);
+    mem::transmute(merged)
+}
+
+/// Narrows the packed 16-bit integers of `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packsswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    let packed = packsswb(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Narrows the packed 32-bit integers of `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packssdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i32x4(), b.as_i32x4());
+    let packed = packssdw(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Narrows the packed 16-bit integers of `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let (lhs, rhs) = (a.as_i16x8(), b.as_i16x8());
+    let packed = packuswb(lhs, rhs);
+    mem::transmute(packed)
+}
+
+/// Returns the `imm8` element of `a`, zero-extended to 32 bits.
+///
+/// Only the low three bits of `imm8` are used to select the element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pextrw, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 {
+    // Cast through `u16` so the upper 16 bits of the result are zero, as Intel
+    // specifies for this intrinsic, instead of copies of the element's sign bit.
+    simd_extract::<_, i16>(a.as_i16x8(), (imm8 & 7) as u32) as u16 as i32
+}
+
+/// Returns a new vector where element `imm8 & 7` of `a` is replaced with `i` truncated to 16 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16))
+}
+
+/// Returns a mask of the most significant bit of each 8-bit element in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    pmovmskb(a.as_i8x16())
+}
+
+/// Shuffles the 32-bit integers in `a`; each 2-bit field of `imm8` picks one source lane.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshufd, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i {
+    // simd_shuffleX requires that its selector parameter be made up of
+    // constant values, but we can't enforce that here. In spirit, we need
+    // to write a `match` on all possible values of a byte, and for each value,
+    // hard-code the correct `simd_shuffleX` call using only constants. We
+    // then hope for LLVM to do the rest.
+    //
+    // Of course, that's... awful. So we use nested macros to generate it for us.
+    let imm8 = (imm8 & 0xFF) as u8; // only the low 8 bits form the control
+    let a = a.as_i32x4();
+
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
+        };
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 { // bits 7:6 -> index for output lane 3
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        };
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 { // bits 5:4 -> index for output lane 2
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        };
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 { // bits 3:2 -> index for output lane 1
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    let x: i32x4 = match imm8 & 0b11 { // bits 1:0 -> index for output lane 0
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    };
+    mem::transmute(x)
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in
+/// `imm8`.
+///
+/// Put the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflehi_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i {
+    // See _mm_shuffle_epi32.
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i16x8();
+    // Leaf: lanes 0..=3 pass through unchanged; each 2-bit selector is offset
+    // by 4 so that it indexes one of the four high lanes.
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle8(a, a, [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4])
+        };
+    }
+    // Bits 7:6 of `imm8` pick the source of result lane 7.
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        };
+    }
+    // Bits 5:4 of `imm8` pick the source of result lane 6.
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        };
+    }
+    // Bits 3:2 of `imm8` pick the source of result lane 5.
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    // Root: bits 1:0 of `imm8` pick the source of result lane 4.
+    let x: i16x8 = match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    };
+    mem::transmute(x)
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in
+/// `imm8`.
+///
+/// Put the results in the low 64 bits of the returned vector, with the high 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflelo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i {
+    // See _mm_shuffle_epi32.
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i16x8();
+
+    // Leaf: the four selectors fill lanes 0..=3; lanes 4..=7 pass through
+    // unchanged.
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
+        };
+    }
+    // Bits 7:6 of `imm8` pick the source of result lane 3.
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        };
+    }
+    // Bits 5:4 of `imm8` pick the source of result lane 2.
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        };
+    }
+    // Bits 3:2 of `imm8` pick the source of result lane 1.
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        };
+    }
+    // Root: bits 1:0 of `imm8` pick the source of result lane 0.
+    let x: i16x8 = match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    };
+    mem::transmute(x)
+}
+
+/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpckhbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=15 name lanes of `a` and 16..=31 name lanes of `b`,
+    // so the list alternates a[8], b[8], a[9], b[9], ... up to a[15], b[15].
+    mem::transmute::<i8x16, _>(simd_shuffle16(
+        a.as_i8x16(),
+        b.as_i8x16(),
+        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
+    ))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpckhwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=7 name lanes of `a` and 8..=15 name lanes of `b`,
+    // so the list alternates a[4], b[4], ... up to a[7], b[7].
+    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
+    mem::transmute::<i16x8, _>(x)
+}
+
+/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=3 name lanes of `a` and 4..=7 name lanes of `b`:
+    // the result is a[2], b[2], a[3], b[3]. Note that the codegen test above
+    // expects the single-precision form `unpckhps` for this integer shuffle.
+    mem::transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=1 name lanes of `a` and 2..=3 name lanes of `b`:
+    // the result is a[1], b[1]. Note that the codegen test above expects the
+    // double-precision form `unpckhpd` for this integer shuffle.
+    mem::transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3]))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpcklbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=15 name lanes of `a` and 16..=31 name lanes of `b`,
+    // so the list alternates a[0], b[0], a[1], b[1], ... up to a[7], b[7].
+    mem::transmute::<i8x16, _>(simd_shuffle16(
+        a.as_i8x16(),
+        b.as_i8x16(),
+        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
+    ))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpcklwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=7 name lanes of `a` and 8..=15 name lanes of `b`,
+    // so the list alternates a[0], b[0], ... up to a[3], b[3].
+    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
+    mem::transmute::<i16x8, _>(x)
+}
+
+/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=3 name lanes of `a` and 4..=7 name lanes of `b`:
+    // the result is a[0], b[0], a[1], b[1]. Note that the codegen test above
+    // expects the single-precision form `unpcklps` for this integer shuffle.
+    mem::transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
+    // Shuffle indices 0..=1 name lanes of `a` and 2..=3 name lanes of `b`:
+    // the result is a[0], b[0]. The `movlhps` instruction check above is
+    // compiled out on Windows targets.
+    mem::transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2]))
+}
+
+/// Return a new vector with the low element of `a` replaced by the sum of the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(addsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Add the two low lanes as scalars and write the sum into lane 0 of `a`;
+    // the other lane of `a` is carried over untouched.
+    simd_insert(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(addpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Lane-wise addition through the generic `simd_add` helper.
+    simd_add(a, b)
+}
+
+/// Return a new vector with the low element of `a` replaced by the result of
+/// dividing the lower element of `a` by the lower element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(divsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Divide the two low lanes as scalars and write the quotient into lane 0
+    // of `a`; the other lane of `a` is carried over untouched.
+    simd_insert(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in `a` by
+/// packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(divpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Lane-wise division (`a` is the dividend) through the generic `simd_div`
+    // helper.
+    simd_div(a, b)
+}
+
+/// Return a new vector with the low element of `a` replaced by the maximum
+/// of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Thin wrapper: both operands are forwarded unchanged to `maxsd`.
+    maxsd(a, b)
+}
+
+/// Return a new vector with the maximum values from corresponding elements in
+/// `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maxpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Thin wrapper: both operands are forwarded unchanged to `maxpd`.
+    maxpd(a, b)
+}
+
+/// Return a new vector with the low element of `a` replaced by the minimum
+/// of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(minsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Thin wrapper: both operands are forwarded unchanged to `minsd`.
+    minsd(a, b)
+}
+
+/// Return a new vector with the minimum values from corresponding elements in
+/// `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(minpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Thin wrapper: both operands are forwarded unchanged to `minpd`.
+    minpd(a, b)
+}
+
+/// Return a new vector with the low element of `a` replaced by multiplying the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mulsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Multiply the two low lanes as scalars and write the product into lane 0
+    // of `a`; the other lane of `a` is carried over untouched.
+    simd_insert(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mulpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Lane-wise multiplication through the generic `simd_mul` helper.
+    simd_mul(a, b)
+}
+
+/// Return a new vector with the low element of `a` replaced by the square
+/// root of the lower element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(sqrtsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
+    // `sqrtsd` is applied to `b` alone; only the low lane of its result is
+    // extracted and written into lane 0 of `a`.
+    simd_insert(a, 0, _mm_cvtsd_f64(sqrtsd(b)))
+}
+
+/// Return a new vector with the square root of each of the values in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(sqrtpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d {
+    // Thin wrapper: the operand is forwarded unchanged to `sqrtpd`.
+    sqrtpd(a)
+}
+
+/// Return a new vector with the low element of `a` replaced by subtracting the
+/// low element of `b` from the low element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(subsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Subtract the low lanes as scalars (`a` minus `b`) and write the
+    // difference into lane 0 of `a`; the other lane of `a` is untouched.
+    simd_insert(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in `b`
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(subpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Lane-wise `a - b` through the generic `simd_sub` helper.
+    simd_sub(a, b)
+}
+
+/// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Reinterpret both operands as integer vectors so the integer AND can be
+    // reused, then reinterpret the result back. Note that the codegen test
+    // above expects the single-precision form `andps`.
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(_mm_and_si128(a, b))
+}
+
+/// Compute the bitwise NOT of `a` and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Reinterpret both operands as integer vectors so the integer AND-NOT can
+    // be reused, then reinterpret the result back. Note that the codegen test
+    // above expects the single-precision form `andnps`.
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(_mm_andnot_si128(a, b))
+}
+
+/// Compute the bitwise OR of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(orps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Reinterpret both operands as integer vectors so the integer OR can be
+    // reused, then reinterpret the result back. Note that the codegen test
+    // above expects the single-precision form `orps`.
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(_mm_or_si128(a, b))
+}
+
+/// Compute the bitwise XOR of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Reinterpret both operands as integer vectors so the integer XOR can be
+    // reused, then reinterpret the result back. Note that the codegen test
+    // above expects the single-precision form `xorps`.
+    let a: __m128i = mem::transmute(a);
+    let b: __m128i = mem::transmute(b);
+    mem::transmute(_mm_xor_si128(a, b))
+}
+
+/// Return a new vector with the low element of `a` replaced by the equality
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpeqsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 0 selects "equal".
+    cmpsd(a, b, 0)
+}
+
+/// Return a new vector with the low element of `a` replaced by the less-than
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 1 selects "less than".
+    cmpsd(a, b, 1)
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// less-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 2 selects "less than or equal".
+    cmpsd(a, b, 2)
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// greater-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
+    // `a > b` is evaluated as `b < a`. Swapping the operands means the
+    // comparison result carries the high lane of `b`, so lane 1 is
+    // overwritten with the high lane of `a` afterwards.
+    simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
+    // `a >= b` is evaluated as `b <= a`. Swapping the operands means the
+    // comparison result carries the high lane of `b`, so lane 1 is
+    // overwritten with the high lane of `a` afterwards.
+    simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Return a new vector with the low element of `a` replaced by the result
+/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
+/// neither is equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpordsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 7 selects "ordered" (neither operand is NaN).
+    cmpsd(a, b, 7)
+}
+
+/// Return a new vector with the low element of `a` replaced by the result of
+/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
+/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpunordsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 3 selects "unordered" (at least one operand is NaN).
+    cmpsd(a, b, 3)
+}
+
+/// Return a new vector with the low element of `a` replaced by the not-equal
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpneqsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 4 selects "not equal".
+    cmpsd(a, b, 4)
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// not-less-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 5 selects "not less than".
+    cmpsd(a, b, 5)
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 6 selects "not less than or equal".
+    cmpsd(a, b, 6)
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// not-greater-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
+    // "`a` not greater than `b`" is evaluated as "`b` not less than `a`".
+    // Swapping the operands means the comparison result carries the high lane
+    // of `b`, so lane 1 is overwritten with the high lane of `a` afterwards.
+    simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Return a new vector with the low element of `a` replaced by the
+/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
+    // "`a` not greater than or equal to `b`" is evaluated as "`b` not less
+    // than or equal to `a`". Swapping the operands means the comparison result
+    // carries the high lane of `b`, so lane 1 is overwritten with the high
+    // lane of `a` afterwards.
+    simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Compare corresponding elements in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpeqpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 0 selects "equal".
+    cmppd(a, b, 0)
+}
+
+/// Compare corresponding elements in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 1 selects "less than".
+    cmppd(a, b, 1)
+}
+
+/// Compare corresponding elements in `a` and `b` for less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 2 selects "less than or equal".
+    cmppd(a, b, 2)
+}
+
+/// Compare corresponding elements in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
+    // `a > b` is evaluated as `b < a`, hence the swapped operands and the
+    // `cmpltpd` instruction check above.
+    _mm_cmplt_pd(b, a)
+}
+
+/// Compare corresponding elements in `a` and `b` for greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
+    // `a >= b` is evaluated as `b <= a`, hence the swapped operands and the
+    // `cmplepd` instruction check above.
+    _mm_cmple_pd(b, a)
+}
+
+/// Compare corresponding elements in `a` and `b` to see if neither is `NaN`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpordpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 7 selects "ordered" (neither operand is NaN).
+    cmppd(a, b, 7)
+}
+
+/// Compare corresponding elements in `a` and `b` to see if either is `NaN`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpunordpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 3 selects "unordered" (at least one operand is NaN).
+    cmppd(a, b, 3)
+}
+
+/// Compare corresponding elements in `a` and `b` for not-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpneqpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 4 selects "not equal".
+    cmppd(a, b, 4)
+}
+
+/// Compare corresponding elements in `a` and `b` for not-less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 5 selects "not less than".
+    cmppd(a, b, 5)
+}
+
+/// Compare corresponding elements in `a` and `b` for not-less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Predicate immediate 6 selects "not less than or equal".
+    cmppd(a, b, 6)
+}
+
+/// Compare corresponding elements in `a` and `b` for not-greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
+    // "`a` not greater than `b`" is evaluated as "`b` not less than `a`",
+    // hence the swapped operands and the `cmpnltpd` instruction check above.
+    _mm_cmpnlt_pd(b, a)
+}
+
+/// Compare corresponding elements in `a` and `b` for
+/// not-greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
+    // "`a` not greater than or equal to `b`" is evaluated as "`b` not less
+    // than or equal to `a`", hence the swapped operands and the `cmpnlepd`
+    // instruction check above.
+    _mm_cmpnle_pd(b, a)
+}
+
+/// Compare the lower element of `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comieqsd` is returned as-is.
+    comieqsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comiltsd` is returned as-is.
+    comiltsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comilesd` is returned as-is.
+    comilesd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comigtsd` is returned as-is.
+    comigtsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comigesd` is returned as-is.
+    comigesd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for not-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
+    // Thin wrapper: the `i32` produced by `comineqsd` is returned as-is.
+    comineqsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
+    // NOTE(review): the summary above is identical to the one on
+    // `_mm_comieq_sd`; presumably the `ucomi` form differs only in how NaN
+    // operands signal floating-point exceptions - confirm and document.
+    ucomieqsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
+    // NOTE(review): the summary above is identical to the one on
+    // `_mm_comilt_sd`; presumably the `ucomi` form differs only in how NaN
+    // operands signal floating-point exceptions - confirm and document.
+    ucomiltsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for less-than-or-equal.
+///
+/// Forwards both operands to `ucomilesd` and returns its `i32` result
+/// unchanged. Test builds assert that this compiles to `ucomisd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
+    ucomilesd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for greater-than.
+///
+/// Forwards both operands to `ucomigtsd` and returns its `i32` result
+/// unchanged. Test builds assert that this compiles to `ucomisd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
+    ucomigtsd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for greater-than-or-equal.
+///
+/// Forwards both operands to `ucomigesd` and returns its `i32` result
+/// unchanged. Test builds assert that this compiles to `ucomisd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
+    ucomigesd(a, b)
+}
+
+/// Compare the lower element of `a` and `b` for not-equal.
+///
+/// Forwards both operands to `ucomineqsd` and returns its `i32` result
+/// unchanged. Test builds assert that this compiles to `ucomisd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
+    ucomineqsd(a, b)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to
+/// packed single-precision (32-bit) floating-point elements.
+///
+/// Forwards `a` to `cvtpd2ps` and returns the resulting `__m128` as is.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtpd2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
+    cvtpd2ps(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed double-precision (64-bit) floating-point elements.
+///
+/// Forwards `a` to `cvtps2pd` and returns the resulting `__m128d` as is.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtps2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
+    cvtps2pd(a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to
+/// packed 32-bit integers.
+///
+/// The value produced by `cvtpd2dq` is reinterpreted bit-for-bit as
+/// `__m128i` with `mem::transmute`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
+    mem::transmute(cvtpd2dq(a))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a`
+/// to a 32-bit integer.
+///
+/// Forwards `a` to `cvtsd2si` and returns its `i32` result unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 {
+    cvtsd2si(a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `b`
+/// to a single-precision (32-bit) floating-point element, store the result in
+/// the lower element of the return value, and copy the upper elements from
+/// `a` to the upper elements of the return value.
+///
+/// Forwards `a` and `b`, in that order, to `cvtsd2ss`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
+    cvtsd2ss(a, b)
+}
+
+/// Return the lower double-precision (64-bit) floating-point element of `a`.
+///
+/// Implemented as an extraction of lane 0; no instruction is asserted for
+/// this function in test builds.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 {
+    simd_extract(a, 0)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `b`
+/// to a double-precision (64-bit) floating-point element, store the result in
+/// the lower element of the return value, and copy the upper element from `a`
+/// to the upper element of the return value.
+///
+/// Forwards `a` and `b`, in that order, to `cvtss2sd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtss2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
+    cvtss2sd(a, b)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+///
+/// The value produced by `cvttpd2dq` is reinterpreted bit-for-bit as
+/// `__m128i` with `mem::transmute`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
+    mem::transmute(cvttpd2dq(a))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a`
+/// to a 32-bit integer with truncation.
+///
+/// Forwards `a` to `cvttsd2si` and returns its `i32` result unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 {
+    cvttsd2si(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+///
+/// The value produced by `cvttps2dq` is reinterpreted bit-for-bit as
+/// `__m128i` with `mem::transmute`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i {
+    mem::transmute(cvttps2dq(a))
+}
+
+/// Copy double-precision (64-bit) floating-point element `a` to the lower
+/// element of the returned vector and set the upper element to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_sd(a: f64) -> __m128d {
+    // `_mm_set_pd` takes the upper element first, so `0.0` lands in the upper
+    // lane and `a` in the lower one.
+    _mm_set_pd(0.0, a)
+}
+
+/// Broadcast double-precision (64-bit) floating-point value `a` to all
+/// elements of the return value.
+///
+/// Built by passing `a` as both arguments of `_mm_set_pd`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_pd(a: f64) -> __m128d {
+    _mm_set_pd(a, a)
+}
+
+/// Broadcast double-precision (64-bit) floating-point value `a` to all
+/// elements of the return value.
+///
+/// This is an alias of `_mm_set1_pd` and simply delegates to it.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_pd1(a: f64) -> __m128d {
+    _mm_set1_pd(a)
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in the return
+/// value with the supplied values.
+///
+/// The arguments are given upper element first: `a` becomes the upper element
+/// and `b` the lower element of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d {
+    // The tuple struct lists the lower lane first, hence the swapped order.
+    __m128d(b, a)
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in the return
+/// value with the supplied values in reverse order.
+///
+/// Compared with `_mm_set_pd` the arguments are swapped: `a` becomes the
+/// lower element and `b` the upper element of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
+    _mm_set_pd(b, a)
+}
+
+/// Returns packed double-precision (64-bit) floating-point elements with all
+/// zeros.
+///
+/// Built with `_mm_set_pd(0.0, 0.0)`. Test builds currently assert `xorps`
+/// rather than `xorpd` (see the FIXME on the attribute).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_pd() -> __m128d {
+    _mm_set_pd(0.0, 0.0)
+}
+
+/// Return a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 2 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// Forwards `a` to `movmskpd` and returns its `i32` result unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movmskpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
+    movmskpd(a)
+}
+
+/// Load 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory into the returned vector.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// # Safety
+///
+/// `mem_addr` is reinterpreted as a `*const __m128d` and dereferenced
+/// directly, so it must point to a readable, suitably aligned 128-bit value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
+    *(mem_addr as *const __m128d)
+}
+
+/// Loads a 64-bit double-precision value to the low element of a
+/// 128-bit floating-point vector of `[2 x double]` and clears the upper
+/// element.
+///
+/// # Safety
+///
+/// `mem_addr` is dereferenced and must point to a readable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
+    // `_mm_setr_pd` takes the lower element first.
+    _mm_setr_pd(*mem_addr, 0.)
+}
+
+/// Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of `[2 x double]`. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// # Safety
+///
+/// `mem_addr` is dereferenced and must point to a readable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+    // Lower element: lane 0 of `a`; upper element: the value read from memory.
+    _mm_setr_pd(simd_extract(a, 0), *mem_addr)
+}
+
+/// Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of `[2 x double]`. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// # Safety
+///
+/// `mem_addr` is dereferenced and must point to a readable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movlpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+    // Lower element: the value read from memory; upper element: lane 1 of `a`.
+    _mm_setr_pd(*mem_addr, simd_extract(a, 1))
+}
+
+/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
+/// aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// Implemented with `intrinsics::nontemporal_store` on `mem_addr` cast to
+/// `*mut __m128d`. Test builds currently assert `movntps` rather than
+/// `movntpd` (see the FIXME on the attribute).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
+    intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// Lane 0 of `a` is written through `mem_addr`. The instruction check is
+/// skipped on Windows targets.
+///
+/// # Safety
+///
+/// `mem_addr` is written through and must point to a writable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
+    *mem_addr = simd_extract(a, 0)
+}
+
+/// Store 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
+/// on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// # Safety
+///
+/// `mem_addr` is reinterpreted as a `*mut __m128d` and written through
+/// directly, so it must point to writable, suitably aligned 128-bit storage.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
+    *(mem_addr as *mut __m128d) = a;
+}
+
+/// Store 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// Forwards to `storeupd`, passing `mem_addr` as a `*mut i8`. Test builds
+/// currently assert `movups` rather than `movupd` (see the FIXME on the
+/// attribute).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
+    storeupd(mem_addr as *mut i8, a);
+}
+
+/// Write the lower double-precision (64-bit) floating-point element of `a`
+/// to two consecutive `f64` slots starting at `mem_addr`. `mem_addr` must be
+/// aligned on a 16-byte boundary or a general-protection exception may be
+/// generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
+    // Duplicate lane 0 into both lanes, then reuse the aligned 128-bit store.
+    let splat: __m128d = simd_shuffle2(a, a, [0, 0]);
+    _mm_store_pd(mem_addr, splat);
+}
+
+/// Write the lower double-precision (64-bit) floating-point element of `a`
+/// to two consecutive `f64` slots starting at `mem_addr`. `mem_addr` must be
+/// aligned on a 16-byte boundary or a general-protection exception may be
+/// generated.
+///
+/// This is an alias of `_mm_store1_pd` and simply delegates to it.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
+    _mm_store1_pd(mem_addr, a);
+}
+
+/// Write both double-precision (64-bit) floating-point elements of `a` to
+/// memory with their order swapped.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
+    // Swap the two lanes, then reuse the aligned 128-bit store.
+    let swapped: __m128d = simd_shuffle2(a, a, [1, 0]);
+    _mm_store_pd(mem_addr, swapped);
+}
+
+/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// Lane 1 of `a` is written through `mem_addr`. The instruction check is
+/// skipped on Windows targets.
+///
+/// # Safety
+///
+/// `mem_addr` is written through and must point to a writable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
+    *mem_addr = simd_extract(a, 1);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// Lane 0 of `a` is written through `mem_addr`, the same operation that
+/// `_mm_store_sd` performs. The instruction check is skipped on Windows
+/// targets.
+///
+/// # Safety
+///
+/// `mem_addr` is written through and must point to a writable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
+    *mem_addr = simd_extract(a, 0);
+}
+
+/// Read one double-precision (64-bit) floating-point value from memory and
+/// replicate it into both elements of the returned vector.
+///
+/// # Safety
+///
+/// `mem_addr` is dereferenced and must point to a readable `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
+    // Single scalar read, broadcast to both lanes.
+    _mm_set1_pd(*mem_addr)
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory
+/// into both elements of returned vector.
+///
+/// This is an alias of `_mm_load1_pd` and simply delegates to it.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
+    _mm_load1_pd(mem_addr)
+}
+
+/// Load 2 double-precision (64-bit) floating-point elements from memory into
+/// the returned vector in reverse order. `mem_addr` must be aligned on a
+/// 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movapd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
+    // Aligned load in memory order, then swap the two lanes.
+    let a = _mm_load_pd(mem_addr);
+    simd_shuffle2(a, a, [1, 0])
+}
+
+/// Load 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory into the returned vector.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// # Safety
+///
+/// `mem_addr` must be valid for reading 128 bits (two consecutive `f64`
+/// values).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+// The cast only changes the pointee type; `read_unaligned` places no
+// alignment requirement on the pointer, so the lint does not apply here.
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
+    // Read the vector directly from the (possibly unaligned) address instead
+    // of byte-copying over a value obtained from `_mm_undefined_pd()`, so no
+    // uninitialized vector is ever materialized.
+    ptr::read_unaligned(mem_addr as *const __m128d)
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// Only the two lowest bits of `imm8` are used: bit 0 picks which element of
+/// `a` becomes the lower element of the result, and bit 1 picks which element
+/// of `b` becomes the upper element. `imm8` must be a compile-time constant.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(shufpd, imm8 = 1))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+    // Shuffle indices 0 and 1 address the elements of `a`, 2 and 3 those of
+    // `b`; each arm spells out one of the four possible selections.
+    match imm8 & 0b11 {
+        0b00 => simd_shuffle2(a, b, [0, 2]),
+        0b01 => simd_shuffle2(a, b, [1, 2]),
+        0b10 => simd_shuffle2(a, b, [0, 3]),
+        _ => simd_shuffle2(a, b, [1, 3]),
+    }
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+    // Lower element: lane 0 of `b`; upper element: lane 1 of `a`.
+    _mm_setr_pd(simd_extract(b, 0), simd_extract(a, 1))
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// This is a bit-for-bit reinterpretation (`mem::transmute`); no value
+/// conversion takes place.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+    mem::transmute(a)
+}
+
+/// Return vector of type __m128d with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_pd() -> __m128d {
+    // FIXME: this function should return MaybeUninit<__m128d>
+    mem::MaybeUninit::<__m128d>::uninitialized().into_inner()
+}
+
+/// Return vector of type __m128i with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_si128() -> __m128i {
+    // FIXME: this function should return MaybeUninit<__m128i>
+    mem::MaybeUninit::<__m128i>::uninitialized().into_inner()
+}
+
+/// The resulting `__m128d` element is composed by the high-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
+///   input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first
+///   input
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Index 1 is the upper element of `a`, index 3 the upper element of `b`.
+    simd_shuffle2(a, b, [1, 3])
+}
+
+/// The resulting `__m128d` element is composed by the low-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
+///
+/// The instruction check (`movlhps`) is skipped on Windows targets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
+    // Index 0 is the lower element of `a`, index 2 the lower element of `b`.
+    simd_shuffle2(a, b, [0, 2])
+}
+
+/// Adds two signed or unsigned 64-bit integer values, returning the
+/// lower 64 bits of the sum.
+///
+/// Forwards both operands to `paddq`. Requires the `sse2` and `mmx` target
+/// features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(paddq))]
+pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 {
+    paddq(a, b)
+}
+
+/// Multiplies 32-bit unsigned integer values contained in the lower bits
+/// of the two 64-bit integer vectors and returns the 64-bit unsigned
+/// product.
+///
+/// Forwards both operands to `pmuludq2`. Requires the `sse2` and `mmx`
+/// target features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(pmuludq))]
+pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 {
+    pmuludq2(a, b)
+}
+
+/// Subtracts signed or unsigned 64-bit integer values and writes the
+/// difference to the corresponding bits in the destination.
+///
+/// Forwards `a` and `b`, in that order, to `psubq`. Requires the `sse2` and
+/// `mmx` target features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(psubq))]
+pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 {
+    psubq(a, b)
+}
+
+/// Converts the two signed 32-bit integer elements of a 64-bit vector of
+/// `[2 x i32]` into two double-precision floating-point values, returned in a
+/// 128-bit vector of `[2 x double]`.
+///
+/// Forwards `a` to `cvtpi2pd`. Requires the `sse2` and `mmx` target
+/// features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(cvtpi2pd))]
+pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d {
+    cvtpi2pd(a)
+}
+
+/// Builds a 128-bit vector of `[2 x i64]` from two 64-bit `__m64` values.
+///
+/// Each argument is reinterpreted bit-for-bit as an `i64` and handed to
+/// `_mm_set_epi64x` in the same order (`e1` first, then `e0`).
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+// no particular instruction to test
+pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i {
+    let first: i64 = mem::transmute(e1);
+    let second: i64 = mem::transmute(e0);
+    _mm_set_epi64x(first, second)
+}
+
+/// Builds a 128-bit vector of `[2 x i64]` in which both elements hold the
+/// given 64-bit value.
+///
+/// `a` is reinterpreted bit-for-bit as an `i64` and passed as both arguments
+/// of `_mm_set_epi64x`.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+// no particular instruction to test
+pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i {
+    let lane: i64 = mem::transmute(a);
+    _mm_set_epi64x(lane, lane)
+}
+
+/// Builds a 128-bit integer vector from two 64-bit values supplied in
+/// reverse order.
+///
+/// Equivalent to `_mm_set_epi64` with its two arguments swapped.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+// no particular instruction to test
+pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i {
+    _mm_set_epi64(e0, e1)
+}
+
+/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+/// integer.
+///
+/// Lane 0 of `a`, viewed as two `i64` lanes, is extracted and reinterpreted
+/// bit-for-bit as `__m64`. Requires the `sse2` and `mmx` target features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+// #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong
+// instr?
+pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 {
+    mem::transmute(simd_extract::<_, i64>(a.as_i64x2(), 0))
+}
+
+/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+/// upper bits.
+///
+/// `a` is reinterpreted bit-for-bit and passed as the second argument of
+/// `_mm_set_epi64x`, with `0` as the first. Requires the `sse2` and `mmx`
+/// target features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+// #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong
+// instr?
+pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i {
+    _mm_set_epi64x(0, mem::transmute(a))
+}
+
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of `[2 x double]` into two signed 32-bit integer values,
+/// returned in a 64-bit vector of `[2 x i32]`.
+///
+/// Forwards `a` to `cvtpd2pi`. Requires the `sse2` and `mmx` target
+/// features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(cvtpd2pi))]
+pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 {
+    cvtpd2pi(a)
+}
+
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of `[2 x double]` into two signed 32-bit integer values,
+/// returned in a 64-bit vector of `[2 x i32]`.
+/// If the result of either conversion is inexact, the result is truncated
+/// (rounded towards zero) regardless of the current MXCSR setting.
+///
+/// Forwards `a` to `cvttpd2pi`. Requires the `sse2` and `mmx` target
+/// features.
+#[inline]
+#[target_feature(enable = "sse2,mmx")]
+#[cfg_attr(test, assert_instr(cvttpd2pi))]
+pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 {
+    cvttpd2pi(a)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse2.pause"]
+    fn pause();
+    #[link_name = "llvm.x86.sse2.clflush"]
+    fn clflush(p: *mut u8);
+    #[link_name = "llvm.x86.sse2.lfence"]
+    fn lfence();
+    #[link_name = "llvm.x86.sse2.mfence"]
+    fn mfence();
+    #[link_name = "llvm.x86.sse2.padds.b"]
+    fn paddsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse2.padds.w"]
+    fn paddsw(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.paddus.b"]
+    fn paddsub(a: u8x16, b: u8x16) -> u8x16;
+    #[link_name = "llvm.x86.sse2.paddus.w"]
+    fn paddsuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse2.pavg.b"]
+    fn pavgb(a: u8x16, b: u8x16) -> u8x16;
+    #[link_name = "llvm.x86.sse2.pavg.w"]
+    fn pavgw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse2.pmadd.wd"]
+    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
+    #[link_name = "llvm.x86.sse2.pmaxs.w"]
+    fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.pmaxu.b"]
+    fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
+    #[link_name = "llvm.x86.sse2.pmins.w"]
+    fn pminsw(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.pminu.b"]
+    fn pminub(a: u8x16, b: u8x16) -> u8x16;
+    #[link_name = "llvm.x86.sse2.pmulh.w"]
+    fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.pmulhu.w"]
+    fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse2.pmulu.dq"]
+    fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
+    #[link_name = "llvm.x86.sse2.psad.bw"]
+    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
+    #[link_name = "llvm.x86.sse2.psubs.b"]
+    fn psubsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse2.psubs.w"]
+    fn psubsw(a: i16x8, b: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psubus.b"]
+    fn psubusb(a: u8x16, b: u8x16) -> u8x16;
+    #[link_name = "llvm.x86.sse2.psubus.w"]
+    fn psubusw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse2.pslli.w"]
+    fn pslliw(a: i16x8, imm8: i32) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psll.w"]
+    fn psllw(a: i16x8, count: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.pslli.d"]
+    fn psllid(a: i32x4, imm8: i32) -> i32x4;
+    #[link_name = "llvm.x86.sse2.psll.d"]
+    fn pslld(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse2.pslli.q"]
+    fn pslliq(a: i64x2, imm8: i32) -> i64x2;
+    #[link_name = "llvm.x86.sse2.psll.q"]
+    fn psllq(a: i64x2, count: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.sse2.psrai.w"]
+    fn psraiw(a: i16x8, imm8: i32) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psra.w"]
+    fn psraw(a: i16x8, count: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psrai.d"]
+    fn psraid(a: i32x4, imm8: i32) -> i32x4;
+    #[link_name = "llvm.x86.sse2.psra.d"]
+    fn psrad(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse2.psrli.w"]
+    fn psrliw(a: i16x8, imm8: i32) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psrl.w"]
+    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
+    #[link_name = "llvm.x86.sse2.psrli.d"]
+    fn psrlid(a: i32x4, imm8: i32) -> i32x4;
+    #[link_name = "llvm.x86.sse2.psrl.d"]
+    fn psrld(a: i32x4, count: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse2.psrli.q"]
+    fn psrliq(a: i64x2, imm8: i32) -> i64x2;
+    #[link_name = "llvm.x86.sse2.psrl.q"]
+    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.sse2.cvtdq2ps"]
+    fn cvtdq2ps(a: i32x4) -> __m128;
+    #[link_name = "llvm.x86.sse2.cvtps2dq"]
+    fn cvtps2dq(a: __m128) -> i32x4;
+    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
+    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
+    #[link_name = "llvm.x86.sse2.packsswb.128"]
+    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
+    #[link_name = "llvm.x86.sse2.packssdw.128"]
+    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
+    #[link_name = "llvm.x86.sse2.packuswb.128"]
+    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
+    #[link_name = "llvm.x86.sse2.pmovmskb.128"]
+    fn pmovmskb(a: i8x16) -> i32;
+    #[link_name = "llvm.x86.sse2.max.sd"]
+    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.max.pd"]
+    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.min.sd"]
+    fn minsd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.min.pd"]
+    fn minpd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.sqrt.sd"]
+    fn sqrtsd(a: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.sqrt.pd"]
+    fn sqrtpd(a: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse2.cmp.sd"]
+    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+    #[link_name = "llvm.x86.sse2.cmp.pd"]
+    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+    #[link_name = "llvm.x86.sse2.comieq.sd"]
+    fn comieqsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.comilt.sd"]
+    fn comiltsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.comile.sd"]
+    fn comilesd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.comigt.sd"]
+    fn comigtsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.comige.sd"]
+    fn comigesd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.comineq.sd"]
+    fn comineqsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
+    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
+    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomile.sd"]
+    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
+    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomige.sd"]
+    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
+    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.movmsk.pd"]
+    fn movmskpd(a: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.cvtpd2ps"]
+    fn cvtpd2ps(a: __m128d) -> __m128;
+    #[link_name = "llvm.x86.sse2.cvtps2pd"]
+    fn cvtps2pd(a: __m128) -> __m128d;
+    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
+    fn cvtpd2dq(a: __m128d) -> i32x4;
+    #[link_name = "llvm.x86.sse2.cvtsd2si"]
+    fn cvtsd2si(a: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
+    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
+    #[link_name = "llvm.x86.sse2.cvtss2sd"]
+    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
+    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
+    fn cvttpd2dq(a: __m128d) -> i32x4;
+    #[link_name = "llvm.x86.sse2.cvttsd2si"]
+    fn cvttsd2si(a: __m128d) -> i32;
+    #[link_name = "llvm.x86.sse2.cvttps2dq"]
+    fn cvttps2dq(a: __m128) -> i32x4;
+    #[link_name = "llvm.x86.sse2.storeu.dq"]
+    fn storeudq(mem_addr: *mut i8, a: __m128i);
+    #[link_name = "llvm.x86.sse2.storeu.pd"]
+    fn storeupd(mem_addr: *mut i8, a: __m128d);
+    #[link_name = "llvm.x86.mmx.padd.q"]
+    fn paddq(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmulu.dq"]
+    fn pmuludq2(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psub.q"]
+    fn psubq(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.sse.cvtpi2pd"]
+    fn cvtpi2pd(a: __m64) -> __m128d;
+    #[link_name = "llvm.x86.sse.cvtpd2pi"]
+    fn cvtpd2pi(a: __m128d) -> __m64;
+    #[link_name = "llvm.x86.sse.cvttpd2pi"]
+    fn cvttpd2pi(a: __m128d) -> __m64;
+}
+
+#[cfg(test)]
+mod tests {
+    use std::f32;
+    use std::f64::{self, NAN};
+    use std::i32;
+    use std::mem::{self, transmute};
+
+    use core_arch::simd::*;
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+    use test::black_box; // Used to inhibit constant-folding.
+
+    // Smoke test: only calls `_mm_pause`; nothing is asserted.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_pause() {
+        _mm_pause();
+    }
+
+    // Smoke test: hands the address of a stack local to `_mm_clflush`;
+    // nothing is asserted.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_clflush() {
+        let local = 0;
+        let addr = &local as *const _ as *mut u8;
+        _mm_clflush(addr);
+    }
+
+    // Smoke test: only calls `_mm_lfence`; nothing is asserted.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_lfence() {
+        _mm_lfence();
+    }
+
+    // Smoke test: only calls `_mm_mfence`; nothing is asserted.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mfence() {
+        _mm_mfence();
+    }
+
+    // Lane-wise byte addition where no lane overflows: (0..=15) + (16..=31).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_epi8() {
+        let lhs = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        #[rustfmt::skip]
+        let rhs = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+        );
+        assert_eq_m128i(_mm_add_epi8(lhs, rhs), expected);
+    }
+
+    // 0x7F + 1 in every byte lane is expected to wrap around to -128.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_epi8_overflow() {
+        let max = _mm_set1_epi8(0x7F);
+        let one = _mm_set1_epi8(1);
+        let wrapped = _mm_set1_epi8(-128);
+        assert_eq_m128i(_mm_add_epi8(max, one), wrapped);
+    }
+
+    // Lane-wise 16-bit addition without overflow: (0..=7) + (8..=15).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_epi16() {
+        let lhs = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let rhs = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+        let expected = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+        let sum = _mm_add_epi16(lhs, rhs);
+        assert_eq_m128i(sum, expected);
+    }
+
+    // Lane-wise 32-bit addition: (0, 1, 2, 3) + (4, 5, 6, 7).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_epi32() {
+        let sum = _mm_add_epi32(_mm_setr_epi32(0, 1, 2, 3), _mm_setr_epi32(4, 5, 6, 7));
+        let expected = _mm_setr_epi32(4, 6, 8, 10);
+        assert_eq_m128i(sum, expected);
+    }
+
+    // Lane-wise 64-bit addition: (0, 1) + (2, 3).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_epi64() {
+        let sum = _mm_add_epi64(_mm_setr_epi64x(0, 1), _mm_setr_epi64x(2, 3));
+        let expected = _mm_setr_epi64x(2, 4);
+        assert_eq_m128i(sum, expected);
+    }
+
+    // `_mm_adds_epi8` on inputs whose sums stay below 0x7F, so the expected
+    // result is the plain lane-wise sum.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi8() {
+        let lhs = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        #[rustfmt::skip]
+        let rhs = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+        );
+        assert_eq_m128i(_mm_adds_epi8(lhs, rhs), expected);
+    }
+
+    // Adding 1 to lanes already at 0x7F is expected to leave them at 0x7F.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi8_saturate_positive() {
+        let max = _mm_set1_epi8(0x7F);
+        let one = _mm_set1_epi8(1);
+        assert_eq_m128i(_mm_adds_epi8(max, one), max);
+    }
+
+    // Adding -1 to lanes already at -0x80 is expected to leave them at -0x80.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi8_saturate_negative() {
+        let min = _mm_set1_epi8(-0x80);
+        let minus_one = _mm_set1_epi8(-1);
+        assert_eq_m128i(_mm_adds_epi8(min, minus_one), min);
+    }
+
+    // `_mm_adds_epi16` on small inputs: the expected result is the plain
+    // lane-wise sum.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi16() {
+        let lhs = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let rhs = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+        let expected = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+        assert_eq_m128i(_mm_adds_epi16(lhs, rhs), expected);
+    }
+
+    // Adding 1 to lanes already at 0x7FFF is expected to leave them at 0x7FFF.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi16_saturate_positive() {
+        let max = _mm_set1_epi16(0x7FFF);
+        let one = _mm_set1_epi16(1);
+        assert_eq_m128i(_mm_adds_epi16(max, one), max);
+    }
+
+    // Adding -1 to lanes already at -0x8000 is expected to leave them at
+    // -0x8000.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epi16_saturate_negative() {
+        let min = _mm_set1_epi16(-0x8000);
+        let minus_one = _mm_set1_epi16(-1);
+        assert_eq_m128i(_mm_adds_epi16(min, minus_one), min);
+    }
+
+    // `_mm_adds_epu8` on small inputs: the expected result is the plain
+    // lane-wise sum.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epu8() {
+        let lhs = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        #[rustfmt::skip]
+        let rhs = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+        );
+        assert_eq_m128i(_mm_adds_epu8(lhs, rhs), expected);
+    }
+
+    // Adding 1 to lanes with all bits set (`!0`) is expected to leave them
+    // unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epu8_saturate() {
+        let all_ones = _mm_set1_epi8(!0);
+        let one = _mm_set1_epi8(1);
+        assert_eq_m128i(_mm_adds_epu8(all_ones, one), all_ones);
+    }
+
+    // `_mm_adds_epu16` on small inputs: the expected result is the plain
+    // lane-wise sum.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epu16() {
+        let lhs = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let rhs = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+        let expected = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+        assert_eq_m128i(_mm_adds_epu16(lhs, rhs), expected);
+    }
+
+    // Adding 1 to 16-bit lanes with all bits set (`!0`) is expected to leave
+    // them unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_adds_epu16_saturate() {
+        let all_ones = _mm_set1_epi16(!0);
+        let one = _mm_set1_epi16(1);
+        assert_eq_m128i(_mm_adds_epu16(all_ones, one), all_ones);
+    }
+
+    // The average of byte lanes 3 and 9 is expected to be 6.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_avg_epu8() {
+        let threes = _mm_set1_epi8(3);
+        let nines = _mm_set1_epi8(9);
+        let avg = _mm_avg_epu8(threes, nines);
+        assert_eq_m128i(avg, _mm_set1_epi8(6));
+    }
+
+    // The average of 16-bit lanes 3 and 9 is expected to be 6.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_avg_epu16() {
+        let threes = _mm_set1_epi16(3);
+        let nines = _mm_set1_epi16(9);
+        let avg = _mm_avg_epu16(threes, nines);
+        assert_eq_m128i(avg, _mm_set1_epi16(6));
+    }
+
+    // Expected 32-bit lanes are sums of adjacent 16-bit products:
+    // 1*9 + 2*10 = 29, 3*11 + 4*12 = 81, 5*13 + 6*14 = 149, 7*15 + 8*16 = 233.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_madd_epi16() {
+        let lhs = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let rhs = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
+        let expected = _mm_setr_epi32(29, 81, 149, 233);
+        assert_eq_m128i(_mm_madd_epi16(lhs, rhs), expected);
+    }
+
+    // With lanes of 1 and -1, the maximum is expected to be the vector of 1s.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_max_epi16() {
+        let pos = _mm_set1_epi16(1);
+        let neg = _mm_set1_epi16(-1);
+        assert_eq_m128i(_mm_max_epi16(pos, neg), pos);
+    }
+
+    // With lanes of 1 and `!0` (all bits set), the maximum is expected to be
+    // the all-bits-set vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_max_epu8() {
+        let ones = _mm_set1_epi8(1);
+        let all_bits = _mm_set1_epi8(!0);
+        assert_eq_m128i(_mm_max_epu8(ones, all_bits), all_bits);
+    }
+
+    // With lanes of 1 and -1, the minimum is expected to be the vector of -1s.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_min_epi16() {
+        let pos = _mm_set1_epi16(1);
+        let neg = _mm_set1_epi16(-1);
+        assert_eq_m128i(_mm_min_epi16(pos, neg), neg);
+    }
+
+    // With lanes of 1 and `!0` (all bits set), the minimum is expected to be
+    // the vector of 1s.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_min_epu8() {
+        let ones = _mm_set1_epi8(1);
+        let all_bits = _mm_set1_epi8(!0);
+        assert_eq_m128i(_mm_min_epu8(ones, all_bits), ones);
+    }
+
+    // 1000 * -1001 = -1_001_000; the expected lanes are -16, the upper 16
+    // bits of that product.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mulhi_epi16() {
+        let lhs = _mm_set1_epi16(1000);
+        let rhs = _mm_set1_epi16(-1001);
+        let high = _mm_mulhi_epi16(lhs, rhs);
+        assert_eq_m128i(high, _mm_set1_epi16(-16));
+    }
+
+    // 1000 * 1001 = 1_001_000; the expected lanes are 15, the upper 16 bits
+    // of that product.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mulhi_epu16() {
+        let lhs = _mm_set1_epi16(1000);
+        let rhs = _mm_set1_epi16(1001);
+        let high = _mm_mulhi_epu16(lhs, rhs);
+        assert_eq_m128i(high, _mm_set1_epi16(15));
+    }
+
+    // 1000 * -1001 = -1_001_000; the expected lanes are -17960, the low 16
+    // bits of that product read as i16.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mullo_epi16() {
+        let lhs = _mm_set1_epi16(1000);
+        let rhs = _mm_set1_epi16(-1001);
+        let low = _mm_mullo_epi16(lhs, rhs);
+        assert_eq_m128i(low, _mm_set1_epi16(-17960));
+    }
+
+    // Lane 0 multiplies 1e9 by 1e9. The lane 1 inputs (1 << 34 and 1 << 35)
+    // have all-zero low 32 bits, and the expected product there is 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mul_epu32() {
+        let lhs = _mm_setr_epi64x(1_000_000_000, 1 << 34);
+        let rhs = _mm_setr_epi64x(1_000_000_000, 1 << 35);
+        let expected = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
+        assert_eq_m128i(_mm_mul_epu32(lhs, rhs), expected);
+    }
+
+    // Sum of absolute byte differences per 64-bit half: the first eight byte
+    // pairs add up to 1020, the last eight to 614.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sad_epu8() {
+        #[rustfmt::skip]
+        let lhs = _mm_setr_epi8(
+            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
+            1, 2, 3, 4,
+            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
+            1, 2, 3, 4,
+        );
+        let rhs = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
+        let expected = _mm_setr_epi64x(1020, 614);
+        assert_eq_m128i(_mm_sad_epu8(lhs, rhs), expected);
+    }
+
+    // 5 - 6 in every byte lane is expected to give -1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_epi8() {
+        let minuend = _mm_set1_epi8(5);
+        let subtrahend = _mm_set1_epi8(6);
+        let diff = _mm_sub_epi8(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi8(-1));
+    }
+
+    // 5 - 6 in every 16-bit lane is expected to give -1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_epi16() {
+        let minuend = _mm_set1_epi16(5);
+        let subtrahend = _mm_set1_epi16(6);
+        let diff = _mm_sub_epi16(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi16(-1));
+    }
+
+    // 5 - 6 in every 32-bit lane is expected to give -1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_epi32() {
+        let minuend = _mm_set1_epi32(5);
+        let subtrahend = _mm_set1_epi32(6);
+        let diff = _mm_sub_epi32(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi32(-1));
+    }
+
+    // 5 - 6 in every 64-bit lane is expected to give -1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_epi64() {
+        let minuend = _mm_set1_epi64x(5);
+        let subtrahend = _mm_set1_epi64x(6);
+        let diff = _mm_sub_epi64(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi64x(-1));
+    }
+
+    // In-range case for `_mm_subs_epi8`: 5 - 2 is expected to give 3.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi8() {
+        let minuend = _mm_set1_epi8(5);
+        let subtrahend = _mm_set1_epi8(2);
+        let diff = _mm_subs_epi8(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi8(3));
+    }
+
+    // Subtracting -1 from lanes already at 0x7F is expected to leave them at
+    // 0x7F.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi8_saturate_positive() {
+        let max = _mm_set1_epi8(0x7F);
+        let minus_one = _mm_set1_epi8(-1);
+        assert_eq_m128i(_mm_subs_epi8(max, minus_one), max);
+    }
+
+    // Subtracting 1 from lanes already at -0x80 is expected to leave them at
+    // -0x80.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi8_saturate_negative() {
+        let min = _mm_set1_epi8(-0x80);
+        let one = _mm_set1_epi8(1);
+        assert_eq_m128i(_mm_subs_epi8(min, one), min);
+    }
+
+    // In-range case for `_mm_subs_epi16`: 5 - 2 is expected to give 3.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi16() {
+        let minuend = _mm_set1_epi16(5);
+        let subtrahend = _mm_set1_epi16(2);
+        let diff = _mm_subs_epi16(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi16(3));
+    }
+
+    // Subtracting -1 from lanes already at 0x7FFF is expected to leave them
+    // at 0x7FFF.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi16_saturate_positive() {
+        let max = _mm_set1_epi16(0x7FFF);
+        let minus_one = _mm_set1_epi16(-1);
+        assert_eq_m128i(_mm_subs_epi16(max, minus_one), max);
+    }
+
+    // Subtracting 1 from lanes already at -0x8000 is expected to leave them
+    // at -0x8000.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epi16_saturate_negative() {
+        let min = _mm_set1_epi16(-0x8000);
+        let one = _mm_set1_epi16(1);
+        assert_eq_m128i(_mm_subs_epi16(min, one), min);
+    }
+
+    // In-range case for `_mm_subs_epu8`: 5 - 2 is expected to give 3.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epu8() {
+        let minuend = _mm_set1_epi8(5);
+        let subtrahend = _mm_set1_epi8(2);
+        let diff = _mm_subs_epu8(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi8(3));
+    }
+
+    // Subtracting 1 from lanes that are 0 is expected to leave them at 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epu8_saturate() {
+        let zero = _mm_set1_epi8(0);
+        let one = _mm_set1_epi8(1);
+        assert_eq_m128i(_mm_subs_epu8(zero, one), zero);
+    }
+
+    // In-range case for `_mm_subs_epu16`: 5 - 2 is expected to give 3.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epu16() {
+        let minuend = _mm_set1_epi16(5);
+        let subtrahend = _mm_set1_epi16(2);
+        let diff = _mm_subs_epu16(minuend, subtrahend);
+        assert_eq_m128i(diff, _mm_set1_epi16(3));
+    }
+
+    // Subtracting 1 from 16-bit lanes that are 0 is expected to leave them
+    // at 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_subs_epu16_saturate() {
+        let zero = _mm_set1_epi16(0);
+        let one = _mm_set1_epi16(1);
+        assert_eq_m128i(_mm_subs_epu16(zero, one), zero);
+    }
+
+    // Byte-wise left shift of the whole 128-bit value. Covers shifts of 1 and
+    // 15 bytes, and expects an all-zero result for 16, -1 and -0x80000000.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_slli_si128() {
+        // The same input vector is used for every shift amount.
+        #[rustfmt::skip]
+        let input = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+        );
+
+        let shifted_1 = _mm_slli_si128(input, 1);
+        let expected_1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        assert_eq_m128i(shifted_1, expected_1);
+
+        let shifted_15 = _mm_slli_si128(input, 15);
+        let expected_15 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+        assert_eq_m128i(shifted_15, expected_15);
+
+        let shifted_16 = _mm_slli_si128(input, 16);
+        assert_eq_m128i(shifted_16, _mm_set1_epi8(0));
+
+        let shifted_neg1 = _mm_slli_si128(input, -1);
+        assert_eq_m128i(_mm_set1_epi8(0), shifted_neg1);
+
+        let shifted_min = _mm_slli_si128(input, -0x80000000);
+        assert_eq_m128i(shifted_min, _mm_set1_epi8(0));
+    }
+
+    // Each 16-bit lane is shifted left by 4 bits; bits pushed past bit 15
+    // are expected to be dropped (0xFFFF -> 0xFFF0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_slli_epi16() {
+        #[rustfmt::skip]
+        let input = _mm_setr_epi16(
+            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi16(
+            0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
+            0, 0, 0, 0,
+        );
+        assert_eq_m128i(_mm_slli_epi16(input, 4), expected);
+    }
+
+    // A count of 4 in lane 0 of the count vector shifts 0xFF to 0xFF0. A 4
+    // placed in lane 4 instead is expected to leave the input unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sll_epi16() {
+        let input = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
+
+        let count_low = _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0);
+        let shifted = _mm_sll_epi16(input, count_low);
+        assert_eq_m128i(shifted, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
+
+        let count_high = _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0);
+        let unshifted = _mm_sll_epi16(input, count_high);
+        assert_eq_m128i(unshifted, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+    }
+
+    // 0xFFFF shifted left by 4 in every 32-bit lane is expected to be 0xFFFF0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_slli_epi32() {
+        let input = _mm_set1_epi32(0xFFFF);
+        let shifted = _mm_slli_epi32(input, 4);
+        assert_eq_m128i(shifted, _mm_set1_epi32(0xFFFF0));
+    }
+
+    // Same shift as the immediate variant, with the count 4 supplied in lane
+    // 0 of a vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sll_epi32() {
+        let input = _mm_set1_epi32(0xFFFF);
+        let count = _mm_setr_epi32(4, 0, 0, 0);
+        assert_eq_m128i(_mm_sll_epi32(input, count), _mm_set1_epi32(0xFFFF0));
+    }
+
+    // 0xFFFFFFFF shifted left by 4 in every 64-bit lane is expected to be
+    // 0xFFFFFFFF0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_slli_epi64() {
+        let input = _mm_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm_slli_epi64(input, 4);
+        assert_eq_m128i(shifted, _mm_set1_epi64x(0xFFFFFFFF0));
+    }
+
+    // Same shift as the immediate variant, with the count 4 supplied in lane
+    // 0 of a vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sll_epi64() {
+        let input = _mm_set1_epi64x(0xFFFFFFFF);
+        let count = _mm_setr_epi64x(4, 0);
+        assert_eq_m128i(_mm_sll_epi64(input, count), _mm_set1_epi64x(0xFFFFFFFF0));
+    }
+
+    // -1 shifted right by 1 with `_mm_srai_epi16` is expected to stay -1 in
+    // every lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srai_epi16() {
+        let input = _mm_set1_epi16(-1);
+        let shifted = _mm_srai_epi16(input, 1);
+        assert_eq_m128i(shifted, _mm_set1_epi16(-1));
+    }
+
+    // -1 shifted right by a vector count of 1 is expected to stay -1 in every
+    // lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sra_epi16() {
+        let input = _mm_set1_epi16(-1);
+        let count = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_sra_epi16(input, count), _mm_set1_epi16(-1));
+    }
+
+    // -1 shifted right by 1 with `_mm_srai_epi32` is expected to stay -1 in
+    // every lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srai_epi32() {
+        let input = _mm_set1_epi32(-1);
+        let shifted = _mm_srai_epi32(input, 1);
+        assert_eq_m128i(shifted, _mm_set1_epi32(-1));
+    }
+
+    // -1 shifted right by a vector count of 1 is expected to stay -1 in every
+    // lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sra_epi32() {
+        let input = _mm_set1_epi32(-1);
+        let count = _mm_setr_epi32(1, 0, 0, 0);
+        assert_eq_m128i(_mm_sra_epi32(input, count), _mm_set1_epi32(-1));
+    }
+
+    // Byte-wise right shift of the whole 128-bit value. Covers shifts of 1
+    // and 15 bytes, and expects an all-zero result for 16, -1 and
+    // -0x80000000.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srli_si128() {
+        // The same input vector is used for every shift amount.
+        #[rustfmt::skip]
+        let input = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+        );
+
+        let shifted_1 = _mm_srli_si128(input, 1);
+        #[rustfmt::skip]
+        let expected_1 = _mm_setr_epi8(
+            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
+        );
+        assert_eq_m128i(shifted_1, expected_1);
+
+        let shifted_15 = _mm_srli_si128(input, 15);
+        let expected_15 = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(shifted_15, expected_15);
+
+        let shifted_16 = _mm_srli_si128(input, 16);
+        assert_eq_m128i(shifted_16, _mm_set1_epi8(0));
+
+        let shifted_neg1 = _mm_srli_si128(input, -1);
+        assert_eq_m128i(shifted_neg1, _mm_set1_epi8(0));
+
+        let shifted_min = _mm_srli_si128(input, -0x80000000);
+        assert_eq_m128i(shifted_min, _mm_set1_epi8(0));
+    }
+
+    // Each 16-bit lane is shifted right by 4 bits with zeros expected in the
+    // vacated high bits (0xFFFF -> 0xFFF).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srli_epi16() {
+        #[rustfmt::skip]
+        let input = _mm_setr_epi16(
+            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi16(
+            0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
+        );
+        assert_eq_m128i(_mm_srli_epi16(input, 4), expected);
+    }
+
+    // A count of 4 in lane 0 of the count vector shifts 0xFF to 0xF. A 4
+    // placed in lane 4 instead is expected to leave the input unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srl_epi16() {
+        let input = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
+
+        let count_low = _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0);
+        let shifted = _mm_srl_epi16(input, count_low);
+        assert_eq_m128i(shifted, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
+
+        let count_high = _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0);
+        let unshifted = _mm_srl_epi16(input, count_high);
+        assert_eq_m128i(unshifted, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+    }
+
+    // 0xFFFF shifted right by 4 in every 32-bit lane is expected to be 0xFFF.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srli_epi32() {
+        let input = _mm_set1_epi32(0xFFFF);
+        let shifted = _mm_srli_epi32(input, 4);
+        assert_eq_m128i(shifted, _mm_set1_epi32(0xFFF));
+    }
+
+    // Same shift as the immediate variant, with the count 4 supplied in lane
+    // 0 of a vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srl_epi32() {
+        let input = _mm_set1_epi32(0xFFFF);
+        let count = _mm_setr_epi32(4, 0, 0, 0);
+        assert_eq_m128i(_mm_srl_epi32(input, count), _mm_set1_epi32(0xFFF));
+    }
+
+    // 0xFFFFFFFF shifted right by 4 in every 64-bit lane is expected to be
+    // 0xFFFFFFF.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srli_epi64() {
+        let input = _mm_set1_epi64x(0xFFFFFFFF);
+        let shifted = _mm_srli_epi64(input, 4);
+        assert_eq_m128i(shifted, _mm_set1_epi64x(0xFFFFFFF));
+    }
+
+    // Same shift as the immediate variant, with the count 4 supplied in lane
+    // 0 of a vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_srl_epi64() {
+        let input = _mm_set1_epi64x(0xFFFFFFFF);
+        let count = _mm_setr_epi64x(4, 0);
+        assert_eq_m128i(_mm_srl_epi64(input, count), _mm_set1_epi64x(0xFFFFFFF));
+    }
+
+    // 5 & 3 == 1 in every byte.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_and_si128() {
+        let fives = _mm_set1_epi8(5);
+        let threes = _mm_set1_epi8(3);
+        assert_eq_m128i(_mm_and_si128(fives, threes), _mm_set1_epi8(1));
+    }
+
+    // !5 & 3 == 2 in every byte: the first operand is the one expected to be
+    // inverted.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_andnot_si128() {
+        let fives = _mm_set1_epi8(5);
+        let threes = _mm_set1_epi8(3);
+        assert_eq_m128i(_mm_andnot_si128(fives, threes), _mm_set1_epi8(2));
+    }
+
+    // 5 | 3 == 7 in every byte.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_or_si128() {
+        let fives = _mm_set1_epi8(5);
+        let threes = _mm_set1_epi8(3);
+        assert_eq_m128i(_mm_or_si128(fives, threes), _mm_set1_epi8(7));
+    }
+
+    // 5 ^ 3 == 6 in every byte.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_xor_si128() {
+        let fives = _mm_set1_epi8(5);
+        let threes = _mm_set1_epi8(3);
+        assert_eq_m128i(_mm_xor_si128(fives, threes), _mm_set1_epi8(6));
+    }
+
+    // The two inputs agree only in lane 2 (both hold 2); that lane is
+    // expected to be 0xFF and every other lane 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpeq_epi8() {
+        let lhs = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let rhs = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        );
+        let mask = _mm_cmpeq_epi8(lhs, rhs);
+        assert_eq_m128i(mask, expected);
+    }
+
+    // The two inputs agree only in lane 2; that lane is expected to be all
+    // ones and every other lane 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpeq_epi16() {
+        let lhs = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let rhs = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
+        let expected = _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmpeq_epi16(lhs, rhs), expected);
+    }
+
+    // The two inputs agree only in lane 2; that lane is expected to be all
+    // ones and every other lane 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpeq_epi32() {
+        let lhs = _mm_setr_epi32(0, 1, 2, 3);
+        let rhs = _mm_setr_epi32(3, 2, 2, 0);
+        let expected = _mm_setr_epi32(0, 0, !0, 0);
+        assert_eq_m128i(_mm_cmpeq_epi32(lhs, rhs), expected);
+    }
+
+    // Only the lane holding 5 is greater than zero; the expected mask has
+    // all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpgt_epi8() {
+        let lhs = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        let zero = _mm_set1_epi8(0);
+        let expected = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmpgt_epi8(lhs, zero), expected);
+    }
+
+    // Only the lane holding 5 is greater than zero; the expected mask has
+    // all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpgt_epi16() {
+        let lhs = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
+        let zero = _mm_set1_epi16(0);
+        let expected = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmpgt_epi16(lhs, zero), expected);
+    }
+
+    // Only the lane holding 5 is greater than zero; the expected mask has
+    // all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpgt_epi32() {
+        let lhs = _mm_set_epi32(5, 0, 0, 0);
+        let zero = _mm_set1_epi32(0);
+        let expected = _mm_set_epi32(!0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmpgt_epi32(lhs, zero), expected);
+    }
+
+    // Zero is less than the second operand only in the lane holding 5; the
+    // expected mask has all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmplt_epi8() {
+        let zero = _mm_set1_epi8(0);
+        let rhs = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        let expected = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmplt_epi8(zero, rhs), expected);
+    }
+
+    // Zero is less than the second operand only in the lane holding 5; the
+    // expected mask has all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmplt_epi16() {
+        let zero = _mm_set1_epi16(0);
+        let rhs = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
+        let expected = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmplt_epi16(zero, rhs), expected);
+    }
+
+    // Zero is less than the second operand only in the lane holding 5; the
+    // expected mask has all bits set in that lane and 0 elsewhere.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmplt_epi32() {
+        let zero = _mm_set1_epi32(0);
+        let rhs = _mm_set_epi32(5, 0, 0, 0);
+        let expected = _mm_set_epi32(!0, 0, 0, 0);
+        assert_eq_m128i(_mm_cmplt_epi32(zero, rhs), expected);
+    }
+
+    // Of the four i32 inputs, the expected f64 pair is (5.0, 15.0), i.e. the
+    // last two `_mm_set_epi32` arguments in reverse order.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtepi32_pd() {
+        let ints = _mm_set_epi32(35, 25, 15, 5);
+        let expected = _mm_setr_pd(5.0, 15.0);
+        assert_eq_m128d(_mm_cvtepi32_pd(ints), expected);
+    }
+
+    // The integer 5 is expected to replace lane 0 as 5.0 while lane 1 keeps
+    // the original 3.5.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi32_sd() {
+        let base = _mm_set1_pd(3.5);
+        let expected = _mm_setr_pd(5.0, 3.5);
+        assert_eq_m128d(_mm_cvtsi32_sd(base, 5), expected);
+    }
+
+    // Converts (1, 2, 3, 4) to the matching f32 lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtepi32_ps() {
+        let ints = _mm_setr_epi32(1, 2, 3, 4);
+        let expected = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        assert_eq_m128(_mm_cvtepi32_ps(ints), expected);
+    }
+
+    // Converts the whole-valued floats (1.0, 2.0, 3.0, 4.0) to the matching
+    // i32 lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtps_epi32() {
+        let floats = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let expected = _mm_setr_epi32(1, 2, 3, 4);
+        assert_eq_m128i(_mm_cvtps_epi32(floats), expected);
+    }
+
+    // The scalar is expected in lane 0 with the remaining lanes zero.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi32_si128() {
+        let expected = _mm_setr_epi32(5, 0, 0, 0);
+        assert_eq_m128i(_mm_cvtsi32_si128(5), expected);
+    }
+
+    // Extracting from (5, 0, 0, 0) is expected to return lane 0, i.e. 5.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi128_si32() {
+        let vector = _mm_setr_epi32(5, 0, 0, 0);
+        assert_eq!(_mm_cvtsi128_si32(vector), 5);
+    }
+
+    // `_mm_set_epi64x` is expected to take its arguments in the reverse
+    // order of `_mm_setr_epi64x`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_epi64x() {
+        let expected = _mm_setr_epi64x(1, 0);
+        assert_eq_m128i(_mm_set_epi64x(0, 1), expected);
+    }
+
+    // `_mm_set_epi32` is expected to take its arguments in the reverse order
+    // of `_mm_setr_epi32`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_epi32() {
+        let expected = _mm_setr_epi32(3, 2, 1, 0);
+        assert_eq_m128i(_mm_set_epi32(0, 1, 2, 3), expected);
+    }
+
+    // `_mm_set_epi16` is expected to take its arguments in the reverse order
+    // of `_mm_setr_epi16`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_epi16() {
+        let expected = _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+        assert_eq_m128i(_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7), expected);
+    }
+
+    // `_mm_set_epi8` is expected to take its arguments in the reverse order
+    // of `_mm_setr_epi8`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_epi8() {
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            15, 14, 13, 12, 11, 10, 9, 8,
+            7, 6, 5, 4, 3, 2, 1, 0,
+        );
+        #[rustfmt::skip]
+        let actual = _mm_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        assert_eq_m128i(actual, expected);
+    }
+
+    // Checks the broadcast against an explicitly written-out vector. Comparing
+    // `_mm_set1_epi64x` with itself could never fail.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set1_epi64x() {
+        let r = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, _mm_setr_epi64x(1, 1));
+    }
+
+    // Checks the broadcast against an explicitly written-out vector. Comparing
+    // `_mm_set1_epi32` with itself could never fail.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set1_epi32() {
+        let r = _mm_set1_epi32(1);
+        assert_eq_m128i(r, _mm_setr_epi32(1, 1, 1, 1));
+    }
+
+    // Checks the broadcast against an explicitly written-out vector. Comparing
+    // `_mm_set1_epi16` with itself could never fail.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set1_epi16() {
+        let r = _mm_set1_epi16(1);
+        assert_eq_m128i(r, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1));
+    }
+
+    // Checks the broadcast against an explicitly written-out vector. Comparing
+    // `_mm_set1_epi8` with itself could never fail.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set1_epi8() {
+        let r = _mm_set1_epi8(1);
+        let e = _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        assert_eq_m128i(r, e);
+    }
+
+    // Reads the lanes back as a plain array so the check does not rely on
+    // `_mm_setr_epi32` itself. Comparing two identical `_mm_setr_epi32` calls
+    // could never fail. The first argument is expected in the lowest lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setr_epi32() {
+        let r: [i32; 4] = transmute(_mm_setr_epi32(0, 1, 2, 3));
+        assert_eq!(r, [0, 1, 2, 3]);
+    }
+
+    // Reads the lanes back as a plain array so the check does not rely on
+    // `_mm_setr_epi16` itself. Comparing two identical `_mm_setr_epi16` calls
+    // could never fail. The first argument is expected in the lowest lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setr_epi16() {
+        let r: [i16; 8] = transmute(_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
+        assert_eq!(r, [0, 1, 2, 3, 4, 5, 6, 7]);
+    }
+
+    // Reads the lanes back as a plain array so the check does not rely on
+    // `_mm_setr_epi8` itself. Comparing two identical `_mm_setr_epi8` calls
+    // could never fail. The first argument is expected in the lowest lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setr_epi8() {
+        #[rustfmt::skip]
+        let r: [i8; 16] = transmute(_mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        ));
+        #[rustfmt::skip]
+        let e: [i8; 16] = [
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        ];
+        assert_eq!(r, e);
+    }
+
+    // `_mm_setzero_si128` must match a vector built by splatting a zero `i64`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setzero_si128() {
+        let zeroed = _mm_setzero_si128();
+        let expected = _mm_set1_epi64x(0);
+        assert_eq_m128i(zeroed, expected);
+    }
+
+    // Loads from a (6, 5) vector and expects (6, 0) back.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadl_epi64() {
+        let src = _mm_setr_epi64x(6, 5);
+        let expected = _mm_setr_epi64x(6, 0);
+        let loaded = _mm_loadl_epi64(&src as *const _);
+        assert_eq_m128i(loaded, expected);
+    }
+
+    // Round-trips a vector through `_mm_load_si128` and expects it unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_load_si128() {
+        let src = _mm_set_epi64x(5, 6);
+        let loaded = _mm_load_si128(&src as *const _ as *const _);
+        assert_eq_m128i(src, loaded);
+    }
+
+    // Round-trips a vector through `_mm_loadu_si128` and expects it unchanged.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadu_si128() {
+        let src = _mm_set_epi64x(5, 6);
+        let loaded = _mm_loadu_si128(&src as *const _ as *const _);
+        assert_eq_m128i(src, loaded);
+    }
+
+    // Only one mask byte has its top bit set; the test expects exactly that
+    // byte of the zeroed destination to receive the source value 9.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_maskmoveu_si128() {
+        let expected = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        let src = _mm_set1_epi8(9);
+        #[rustfmt::skip]
+        let select = _mm_set_epi8(
+            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0,
+        );
+        let mut dst = _mm_set1_epi8(0);
+        _mm_maskmoveu_si128(src, select, &mut dst as *mut _ as *mut i8);
+        assert_eq_m128i(dst, expected);
+    }
+
+    // Stores a vector of 9s over a zeroed vector and expects the destination to match.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store_si128() {
+        let src = _mm_set1_epi8(9);
+        let mut dst = _mm_set1_epi8(0);
+        let dst_ptr = &mut dst as *mut _ as *mut __m128i;
+        _mm_store_si128(dst_ptr, src);
+        assert_eq_m128i(dst, src);
+    }
+
+    // Stores a vector of 9s over a zeroed vector with `_mm_storeu_si128` and
+    // expects the destination to match.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storeu_si128() {
+        let src = _mm_set1_epi8(9);
+        let mut dst = _mm_set1_epi8(0);
+        let dst_ptr = &mut dst as *mut _ as *mut __m128i;
+        _mm_storeu_si128(dst_ptr, src);
+        assert_eq_m128i(dst, src);
+    }
+
+    // Stores (2, 9) into a zeroed vector and expects (2, 0) afterwards.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storel_epi64() {
+        let src = _mm_setr_epi64x(2, 9);
+        let expected = _mm_setr_epi64x(2, 0);
+        let mut dst = _mm_set1_epi8(0);
+        let dst_ptr = &mut dst as *mut _ as *mut __m128i;
+        _mm_storel_epi64(dst_ptr, src);
+        assert_eq_m128i(dst, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_stream_si128() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let mut r = _mm_undefined_si128();
+        _mm_stream_si128(&mut r as *mut _, a);
+        assert_eq_m128i(r, a);
+    }
+
+    // Streams 7 into a boxed `i32` initialised to -1 and expects the box to hold 7.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_stream_si32() {
+        let value: i32 = 7;
+        let mut slot = ::std::boxed::Box::<i32>::new(-1);
+        let slot_ptr = &mut *slot as *mut i32;
+        _mm_stream_si32(slot_ptr, value);
+        assert_eq!(value, *slot);
+    }
+
+    // Expects `_mm_move_epi64` to turn (5, 6) into (5, 0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_move_epi64() {
+        let expected = _mm_setr_epi64x(5, 0);
+        let moved = _mm_move_epi64(_mm_setr_epi64x(5, 6));
+        assert_eq_m128i(moved, expected);
+    }
+
+    // Inputs 0x80 and -0x81 lie just outside the `i8` range; the test expects them
+    // to come out as 0x7F and -0x80 respectively.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packs_epi16() {
+        let lo = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
+        let hi = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            0x7F, -0x80, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, -0x80, 0x7F,
+        );
+        let packed = _mm_packs_epi16(lo, hi);
+        assert_eq_m128i(packed, expected);
+    }
+
+    // Inputs 0x8000 and -0x8001 lie just outside the `i16` range; the test expects
+    // them to come out as 0x7FFF and -0x8000 respectively.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packs_epi32() {
+        let lo = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
+        let hi = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
+        let expected = _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF);
+        let packed = _mm_packs_epi32(lo, hi);
+        assert_eq_m128i(packed, expected);
+    }
+
+    // The test expects 0x100 to pack to an all-ones byte and -1 to pack to 0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_packus_epi16() {
+        let lo = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
+        let hi = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
+        let expected = _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0);
+        let packed = _mm_packus_epi16(lo, hi);
+        assert_eq_m128i(packed, expected);
+    }
+
+    // Extracts with index 0 (expects -1) and with index 11, for which the test
+    // expects the value stored in lane 3.
+    // NOTE(review): presumably only the low three index bits are used - confirm.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_extract_epi16() {
+        let lanes = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
+        let first = _mm_extract_epi16(lanes, 0);
+        let wrapped = _mm_extract_epi16(lanes, 11);
+        assert_eq!(first, -1);
+        assert_eq!(wrapped, 3);
+    }
+
+    // Inserts 9 at index 0 and expects every other lane to keep its value.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_insert_epi16() {
+        let expected = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
+        let base = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let patched = _mm_insert_epi16(base, 9, 0);
+        assert_eq_m128i(patched, expected);
+    }
+
+    // Bytes 0, 2, 5, 10, 13 and 15 have their top bit set; the expected mask has
+    // exactly those bit positions set.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_movemask_epi8() {
+        #[rustfmt::skip]
+        let bytes = _mm_setr_epi8(
+            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
+            0b0101, 0b1111_0000u8 as i8, 0, 0,
+            0, 0, 0b1111_0000u8 as i8, 0b0101,
+            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
+        );
+        let mask = _mm_movemask_epi8(bytes);
+        assert_eq!(mask, 0b10100100_00100101);
+    }
+
+    // Shuffles (5, 10, 15, 20) with control 0b00_01_01_11 and expects (20, 10, 10, 5).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_shuffle_epi32() {
+        let expected = _mm_setr_epi32(20, 10, 10, 5);
+        let input = _mm_setr_epi32(5, 10, 15, 20);
+        let shuffled = _mm_shuffle_epi32(input, 0b00_01_01_11);
+        assert_eq_m128i(shuffled, expected);
+    }
+
+    // Expects the upper four lanes to be permuted to (20, 10, 10, 5) while the
+    // lower four lanes (1, 2, 3, 4) stay as they were.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_shufflehi_epi16() {
+        let expected = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
+        let input = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
+        let shuffled = _mm_shufflehi_epi16(input, 0b00_01_01_11);
+        assert_eq_m128i(shuffled, expected);
+    }
+
+    // Expects the lower four lanes to be permuted to (20, 10, 10, 5) while the
+    // upper four lanes (1, 2, 3, 4) stay as they were.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_shufflelo_epi16() {
+        let expected = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
+        let input = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
+        let shuffled = _mm_shufflelo_epi16(input, 0b00_01_01_11);
+        assert_eq_m128i(shuffled, expected);
+    }
+
+    // Expects the upper eight bytes of the two inputs (8..=15 and 24..=31) to be
+    // interleaved, first input first.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpackhi_epi8() {
+        #[rustfmt::skip]
+        let first = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let second = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            8, 24, 9, 25, 10, 26, 11, 27,
+            12, 28, 13, 29, 14, 30, 15, 31,
+        );
+        let interleaved = _mm_unpackhi_epi8(first, second);
+        assert_eq_m128i(interleaved, expected);
+    }
+
+    // Expects the upper four lanes of each input (4..=7 and 12..=15) interleaved.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpackhi_epi16() {
+        let first = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let second = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+        let expected = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
+        assert_eq_m128i(_mm_unpackhi_epi16(first, second), expected);
+    }
+
+    // Expects the upper two lanes of each input (2, 3 and 6, 7) interleaved.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpackhi_epi32() {
+        let first = _mm_setr_epi32(0, 1, 2, 3);
+        let second = _mm_setr_epi32(4, 5, 6, 7);
+        let expected = _mm_setr_epi32(2, 6, 3, 7);
+        assert_eq_m128i(_mm_unpackhi_epi32(first, second), expected);
+    }
+
+    // Expects the upper lane of each input (1 and 3) as the result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpackhi_epi64() {
+        let first = _mm_setr_epi64x(0, 1);
+        let second = _mm_setr_epi64x(2, 3);
+        let expected = _mm_setr_epi64x(1, 3);
+        assert_eq_m128i(_mm_unpackhi_epi64(first, second), expected);
+    }
+
+    // Expects the lower eight bytes of the two inputs (0..=7 and 16..=23) to be
+    // interleaved, first input first.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpacklo_epi8() {
+        #[rustfmt::skip]
+        let first = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let second = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
+        );
+        let interleaved = _mm_unpacklo_epi8(first, second);
+        assert_eq_m128i(interleaved, expected);
+    }
+
+    // Expects the lower four lanes of each input (0..=3 and 8..=11) interleaved.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpacklo_epi16() {
+        let first = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let second = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+        let expected = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
+        assert_eq_m128i(_mm_unpacklo_epi16(first, second), expected);
+    }
+
+    // Expects the lower two lanes of each input (0, 1 and 4, 5) interleaved.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpacklo_epi32() {
+        let first = _mm_setr_epi32(0, 1, 2, 3);
+        let second = _mm_setr_epi32(4, 5, 6, 7);
+        let expected = _mm_setr_epi32(0, 4, 1, 5);
+        assert_eq_m128i(_mm_unpacklo_epi32(first, second), expected);
+    }
+
+    // Expects the lower lane of each input (0 and 2) as the result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpacklo_epi64() {
+        let first = _mm_setr_epi64x(0, 1);
+        let second = _mm_setr_epi64x(2, 3);
+        let expected = _mm_setr_epi64x(0, 2);
+        assert_eq_m128i(_mm_unpacklo_epi64(first, second), expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (6.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let sum = _mm_add_sd(lhs, rhs);
+        let expected = _mm_setr_pd(6.0, 2.0);
+        assert_eq_m128d(sum, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (6.0, 12.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_add_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let sum = _mm_add_pd(lhs, rhs);
+        let expected = _mm_setr_pd(6.0, 12.0);
+        assert_eq_m128d(sum, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (0.2, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_div_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let quotient = _mm_div_sd(lhs, rhs);
+        let expected = _mm_setr_pd(0.2, 2.0);
+        assert_eq_m128d(quotient, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (0.2, 0.2).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_div_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let quotient = _mm_div_pd(lhs, rhs);
+        let expected = _mm_setr_pd(0.2, 0.2);
+        assert_eq_m128d(quotient, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (5.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_max_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let largest = _mm_max_sd(lhs, rhs);
+        let expected = _mm_setr_pd(5.0, 2.0);
+        assert_eq_m128d(largest, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (5.0, 10.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_max_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let largest = _mm_max_pd(lhs, rhs);
+        let expected = _mm_setr_pd(5.0, 10.0);
+        assert_eq_m128d(largest, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (1.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_min_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let smallest = _mm_min_sd(lhs, rhs);
+        let expected = _mm_setr_pd(1.0, 2.0);
+        assert_eq_m128d(smallest, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (1.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_min_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let smallest = _mm_min_pd(lhs, rhs);
+        let expected = _mm_setr_pd(1.0, 2.0);
+        assert_eq_m128d(smallest, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (5.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mul_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let product = _mm_mul_sd(lhs, rhs);
+        let expected = _mm_setr_pd(5.0, 2.0);
+        assert_eq_m128d(product, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (5.0, 20.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_mul_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let product = _mm_mul_pd(lhs, rhs);
+        let expected = _mm_setr_pd(5.0, 20.0);
+        assert_eq_m128d(product, expected);
+    }
+
+    // Expects lane 0 to be sqrt(5.0), taken from the second argument, and lane 1
+    // to be 2.0, as in the first argument.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sqrt_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let root = _mm_sqrt_sd(lhs, rhs);
+        let expected = _mm_setr_pd(5.0f64.sqrt(), 2.0);
+        assert_eq_m128d(root, expected);
+    }
+
+    // Expects each lane of (1.0, 2.0) to be replaced by its `f64::sqrt`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sqrt_pd() {
+        let input = _mm_setr_pd(1.0, 2.0);
+        let roots = _mm_sqrt_pd(input);
+        let expected = _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt());
+        assert_eq_m128d(roots, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (-4.0, 2.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_sd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let difference = _mm_sub_sd(lhs, rhs);
+        let expected = _mm_setr_pd(-4.0, 2.0);
+        assert_eq_m128d(difference, expected);
+    }
+
+    // (1.0, 2.0) and (5.0, 10.0) are expected to give (-4.0, -8.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_sub_pd() {
+        let (lhs, rhs) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 10.0));
+        let difference = _mm_sub_pd(lhs, rhs);
+        let expected = _mm_setr_pd(-4.0, -8.0);
+        assert_eq_m128d(difference, expected);
+    }
+
+    // Bit patterns 5 and 3 in every lane are expected to AND to 1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_and_pd() {
+        let got = _mm_and_pd(transmute(u64x2::splat(5)), transmute(u64x2::splat(3)));
+        assert_eq_m128d(got, transmute(u64x2::splat(1)));
+    }
+
+    // With bit patterns 5 and 3 in every lane the test expects 2, i.e. `!5 & 3`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_andnot_pd() {
+        let got = _mm_andnot_pd(transmute(u64x2::splat(5)), transmute(u64x2::splat(3)));
+        assert_eq_m128d(got, transmute(u64x2::splat(2)));
+    }
+
+    // Bit patterns 5 and 3 in every lane are expected to OR to 7.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_or_pd() {
+        let got = _mm_or_pd(transmute(u64x2::splat(5)), transmute(u64x2::splat(3)));
+        assert_eq_m128d(got, transmute(u64x2::splat(7)));
+    }
+
+    // Bit patterns 5 and 3 in every lane are expected to XOR to 6.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_xor_pd() {
+        let got = _mm_xor_pd(transmute(u64x2::splat(5)), transmute(u64x2::splat(3)));
+        assert_eq_m128d(got, transmute(u64x2::splat(6)));
+    }
+
+    // Low lanes 1.0 and 1.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpeq_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpeq_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 5.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmplt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmplt_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 1.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmple_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmple_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 5.0 and 1.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpgt_sd() {
+        let a = _mm_setr_pd(5.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpgt_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 1.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpge_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpge_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes NAN and 5.0: expects a zero low lane and the bits of `a`'s 2.0
+    // in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpord_sd() {
+        let a = _mm_setr_pd(NAN, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpord_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, transmute(2.0f64)));
+    }
+
+    // Low lanes NAN and 5.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpunord_sd() {
+        let a = _mm_setr_pd(NAN, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpunord_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 5.0: expects an all-ones low lane and the bits of `a`'s
+    // 2.0 in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpneq_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpneq_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 5.0: expects a zero low lane and the bits of `a`'s 2.0
+    // in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnlt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnlt_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 1.0: expects a zero low lane and the bits of `a`'s 2.0
+    // in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnle_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnle_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, transmute(2.0f64)));
+    }
+
+    // Low lanes 5.0 and 1.0: expects a zero low lane and the bits of `a`'s 2.0
+    // in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpngt_sd() {
+        let a = _mm_setr_pd(5.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpngt_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, transmute(2.0f64)));
+    }
+
+    // Low lanes 1.0 and 1.0: expects a zero low lane and the bits of `a`'s 2.0
+    // in the high lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnge_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnge_sd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, transmute(2.0f64)));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is (all ones, zero).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpeq_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpeq_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, 0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is (zero, all ones).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmplt_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmplt_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, !0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is all ones in both lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmple_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmple_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, !0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is zero in both lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpgt_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpgt_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, 0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is (all ones, zero).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpge_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpge_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, 0));
+    }
+
+    // Lane pairs (NAN, 5.0) and (2.0, 3.0): expected mask is (zero, all ones).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpord_pd() {
+        let a = _mm_setr_pd(NAN, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpord_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, !0));
+    }
+
+    // Lane pairs (NAN, 5.0) and (2.0, 3.0): expected mask is (all ones, zero).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpunord_pd() {
+        let a = _mm_setr_pd(NAN, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpunord_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, 0));
+    }
+
+    // Lane pairs (1.0, 5.0) and (2.0, 3.0): expected mask is all ones in both lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpneq_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpneq_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(!0, !0));
+    }
+
+    // Lane pairs (1.0, 5.0) and (2.0, 3.0): expected mask is zero in both lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnlt_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(5.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnlt_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, 0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is zero in both lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnle_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnle_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, 0));
+    }
+
+    // Lane pairs (5.0, 1.0) and (2.0, 3.0): expected mask is (zero, all ones).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpngt_pd() {
+        let a = _mm_setr_pd(5.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpngt_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, !0));
+    }
+
+    // Lane pairs (1.0, 1.0) and (2.0, 3.0): expected mask is (zero, all ones).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cmpnge_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        let mask: __m128i = transmute(_mm_cmpnge_pd(a, b));
+        assert_eq_m128i(mask, _mm_setr_epi64x(0, !0));
+    }
+
+    // Expects a non-zero result when both low lanes are 1.0, and zero when the
+    // first operand's low lane is NAN.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comieq_sd() {
+        {
+            let a = _mm_setr_pd(1.0, 2.0);
+            let b = _mm_setr_pd(1.0, 3.0);
+            assert!(_mm_comieq_sd(a, b) != 0);
+        }
+        {
+            let a = _mm_setr_pd(NAN, 2.0);
+            let b = _mm_setr_pd(1.0, 3.0);
+            assert!(_mm_comieq_sd(a, b) == 0);
+        }
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comilt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_comilt_sd(a, b) == 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a non-zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comile_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_comile_sd(a, b) != 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comigt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_comigt_sd(a, b) == 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a non-zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comige_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_comige_sd(a, b) != 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_comineq_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_comineq_sd(a, b) == 0);
+    }
+
+    // Expects a non-zero result when both low lanes are 1.0, and zero when both
+    // low lanes are NAN.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomieq_sd() {
+        {
+            let a = _mm_setr_pd(1.0, 2.0);
+            let b = _mm_setr_pd(1.0, 3.0);
+            assert!(_mm_ucomieq_sd(a, b) != 0);
+        }
+        {
+            let a = _mm_setr_pd(NAN, 2.0);
+            let b = _mm_setr_pd(NAN, 3.0);
+            assert!(_mm_ucomieq_sd(a, b) == 0);
+        }
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomilt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_ucomilt_sd(a, b) == 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a non-zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomile_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_ucomile_sd(a, b) != 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomigt_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_ucomigt_sd(a, b) == 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a non-zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomige_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_ucomige_sd(a, b) != 0);
+    }
+
+    // Both low lanes are 1.0; the test expects a zero result.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_ucomineq_sd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(1.0, 3.0);
+        assert!(_mm_ucomineq_sd(a, b) == 0);
+    }
+
+    // (-1.0, 5.0) is expected to give 0b01 and (-1.0, -5.0) to give 0b11.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_movemask_pd() {
+        let low_negative = _mm_setr_pd(-1.0, 5.0);
+        assert_eq!(_mm_movemask_pd(low_negative), 0b01);
+
+        let both_negative = _mm_setr_pd(-1.0, -5.0);
+        assert_eq!(_mm_movemask_pd(both_negative), 0b11);
+    }
+
+    // Scratch buffer of four `f64`s with 16-byte alignment, used by the `pd`
+    // load/store tests that follow.
+    #[repr(align(16))]
+    struct Memory {
+        // Backing storage that the tests take raw pointers into.
+        data: [f64; 4],
+    }
+
+    // Loads from the start of an aligned buffer and expects its first two values.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_load_pd() {
+        let aligned = Memory {
+            data: [1.0f64, 2.0, 3.0, 4.0],
+        };
+        let loaded = _mm_load_pd(aligned.data.as_ptr());
+        assert_eq_m128d(loaded, _mm_setr_pd(1.0, 2.0));
+    }
+
+    // Loads a single 1.0 and expects (1.0, 0.0).
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_load_sd() {
+        let scalar = 1.;
+        let loaded = _mm_load_sd(&scalar);
+        let want = _mm_setr_pd(scalar, 0.);
+        assert_eq_m128d(loaded, want);
+    }
+
+    // Expects the low lane of the input vector to be kept and the high lane to
+    // become the loaded 3.0.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadh_pd() {
+        let base = _mm_setr_pd(1., 2.);
+        let high = 3.;
+        let loaded = _mm_loadh_pd(base, &high);
+        let want = _mm_setr_pd(_mm_cvtsd_f64(base), 3.);
+        assert_eq_m128d(loaded, want);
+    }
+
+    // Expects the low lane to become the loaded 3.0 and the high lane of the
+    // input vector to be kept.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadl_pd() {
+        let base = _mm_setr_pd(1., 2.);
+        let low = 3.;
+        let loaded = _mm_loadl_pd(base, &low);
+        let want = _mm_setr_pd(3., get_m128d(base, 1));
+        assert_eq_m128d(loaded, want);
+    }
+
+    // Streams a vector of 7.0s into a 128-byte aligned two-element buffer and
+    // checks every element against the corresponding source lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_stream_pd() {
+        // Local buffer type; shadows the module-level `Memory` inside this test.
+        #[repr(align(128))]
+        struct Memory {
+            pub data: [f64; 2],
+        }
+        let src = _mm_set1_pd(7.0);
+        let mut dst = Memory { data: [-1.0; 2] };
+
+        _mm_stream_pd(&mut dst.data[0] as *mut f64, src);
+        for (lane, stored) in dst.data.iter().enumerate() {
+            assert_eq!(*stored, get_m128d(src, lane));
+        }
+    }
+
+    // Stores from (1.0, 2.0) into a scalar and expects the vector's low lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store_sd() {
+        let src = _mm_setr_pd(1., 2.);
+        let mut out = 0.;
+        _mm_store_sd(&mut out, src);
+        assert_eq!(out, _mm_cvtsd_f64(src));
+    }
+
+    // Stores (1.0, 2.0) at the start of an aligned buffer and checks both slots.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store_pd() {
+        let mut buf = Memory { data: [0.0f64; 4] };
+        let src = _mm_setr_pd(1.0, 2.0);
+
+        _mm_store_pd(buf.data.as_mut_ptr(), *black_box(&src));
+        assert_eq!(buf.data[0], 1.0);
+        assert_eq!(buf.data[1], 2.0);
+    }
+
+    // Stores (1.0, 2.0) through a pointer that is deliberately not 16-byte aligned,
+    // then checks the two written slots and that the slot before them is untouched.
+    // `_mm_storeu_pd` is an SSE2 intrinsic, so the test is gated on "sse2" like
+    // the neighbouring tests.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storeu_pd() {
+        let mut mem = Memory { data: [0.0f64; 4] };
+        let vals = &mut mem.data;
+        let a = _mm_setr_pd(1.0, 2.0);
+
+        let mut ofs = 0;
+        let mut p = vals.as_mut_ptr();
+
+        // Make sure p is *not* aligned to 16-byte boundary
+        if (p as usize) & 0xf == 0 {
+            ofs = 1;
+            p = p.offset(1);
+        }
+
+        _mm_storeu_pd(p, *black_box(&a));
+
+        // When the pointer was bumped, the skipped slot must still be zero.
+        if ofs > 0 {
+            assert_eq!(vals[ofs - 1], 0.0);
+        }
+        assert_eq!(vals[ofs], 1.0);
+        assert_eq!(vals[ofs + 1], 2.0);
+    }
+
+    // Stores from (1.0, 2.0) and expects 1.0 in both of the first two slots.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store1_pd() {
+        let mut buf = Memory { data: [0.0f64; 4] };
+        let src = _mm_setr_pd(1.0, 2.0);
+
+        _mm_store1_pd(buf.data.as_mut_ptr(), *black_box(&src));
+        assert_eq!(buf.data[0], 1.0);
+        assert_eq!(buf.data[1], 1.0);
+    }
+
+    // Stores from (1.0, 2.0) and expects 1.0 in both of the first two slots.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_store_pd1() {
+        let mut buf = Memory { data: [0.0f64; 4] };
+        let src = _mm_setr_pd(1.0, 2.0);
+
+        _mm_store_pd1(buf.data.as_mut_ptr(), *black_box(&src));
+        assert_eq!(buf.data[0], 1.0);
+        assert_eq!(buf.data[1], 1.0);
+    }
+
+    // Stores from (1.0, 2.0) and expects the two values in reversed order.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storer_pd() {
+        let mut buf = Memory { data: [0.0f64; 4] };
+        let src = _mm_setr_pd(1.0, 2.0);
+
+        _mm_storer_pd(buf.data.as_mut_ptr(), *black_box(&src));
+        assert_eq!(buf.data[0], 2.0);
+        assert_eq!(buf.data[1], 1.0);
+    }
+
+    // Stores from (1.0, 2.0) into a scalar and expects the vector's lane 1.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storeh_pd() {
+        let src = _mm_setr_pd(1., 2.);
+        let mut out = 0.;
+        _mm_storeh_pd(&mut out, src);
+        assert_eq!(out, get_m128d(src, 1));
+    }
+
+    // Stores from (1.0, 2.0) into a scalar and expects the vector's low lane.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_storel_pd() {
+        let src = _mm_setr_pd(1., 2.);
+        let mut out = 0.;
+        _mm_storel_pd(&mut out, src);
+        assert_eq!(out, _mm_cvtsd_f64(src));
+    }
+
+    // Loads from the start of an aligned buffer and expects its first two values
+    // in reversed order.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadr_pd() {
+        let aligned = Memory {
+            data: [1.0f64, 2.0, 3.0, 4.0],
+        };
+        let loaded = _mm_loadr_pd(aligned.data.as_ptr());
+        assert_eq_m128d(loaded, _mm_setr_pd(2.0, 1.0));
+    }
+
+    // Loads two values through a pointer that is deliberately not 16-byte aligned.
+    // Because the buffer holds 1.0, 2.0, 3.0, 4.0, skipping `offset` elements
+    // raises both expected values by `offset`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_loadu_pd() {
+        let buf = Memory {
+            data: [1.0f64, 2.0, 3.0, 4.0],
+        };
+        let base = buf.data.as_ptr();
+
+        // make sure the load address is not aligned to 16-byte boundary
+        let offset = if (base as usize) & 0xf == 0 { 1 } else { 0 };
+        let src = base.offset(offset as isize);
+
+        let loaded = _mm_loadu_pd(src);
+        let shift = _mm_set1_pd(offset as f64);
+        let want = _mm_add_pd(_mm_setr_pd(1.0, 2.0), shift);
+        assert_eq_m128d(loaded, want);
+    }
+
+    // Converts several `f64` pairs; each expected result has the converted values
+    // in lanes 0-1 and zeros in lanes 2-3. `f64::MAX`/`f64::MIN` are expected to
+    // become the `f32` infinities.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtpd_ps() {
+        let mixed = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
+        assert_eq_m128(mixed, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
+
+        let negative = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
+        assert_eq_m128(negative, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
+
+        let overflowing = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
+        let infinities = _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0);
+        assert_eq_m128(overflowing, infinities);
+
+        let extremes = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
+        assert_eq_m128(extremes, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
+    }
+
+    // Expects the first two `f32` lanes of the input to appear as `f64` lanes.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtps_pd() {
+        let plain = _mm_setr_ps(-1.0, 2.0, -3.0, 5.0);
+        assert_eq_m128d(_mm_cvtps_pd(plain), _mm_setr_pd(-1.0, 2.0));
+
+        let extremes = _mm_setr_ps(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN);
+        let widened = _mm_cvtps_pd(extremes);
+        assert_eq_m128d(widened, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
+    }
+
+    // Converts several `f64` pairs; results sit in lanes 0-1 with zeros in lanes
+    // 2-3. Out-of-range, infinite and NaN inputs are all expected to give `i32::MIN`.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtpd_epi32() {
+        let saturated = _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0);
+
+        let mixed = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
+        assert_eq_m128i(mixed, _mm_setr_epi32(-1, 5, 0, 0));
+
+        let negative = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
+        assert_eq_m128i(negative, _mm_setr_epi32(-1, -5, 0, 0));
+
+        let huge = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
+        assert_eq_m128i(huge, saturated);
+
+        let infinite = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
+        assert_eq_m128i(infinite, saturated);
+
+        let nan = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
+        assert_eq_m128i(nan, saturated);
+    }
+
+    // Expects -2 for a low lane of -2.0, and `i32::MIN` for `f64::MAX` and NaN.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsd_si32() {
+        let plain = _mm_setr_pd(-2.0, 5.0);
+        assert_eq!(_mm_cvtsd_si32(plain), -2);
+
+        let huge = _mm_setr_pd(f64::MAX, f64::MIN);
+        assert_eq!(_mm_cvtsd_si32(huge), i32::MIN);
+
+        let nan = _mm_setr_pd(f64::NAN, f64::NAN);
+        assert_eq!(_mm_cvtsd_si32(nan), i32::MIN);
+    }
+
+    // Expects lane 0 of the result to be the converted low lane of the `f64`
+    // vector, with lanes 1-3 equal to those of the `f32` vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsd_ss() {
+        let singles = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
+        let doubles = _mm_setr_pd(2.0, -5.0);
+        let converted = _mm_cvtsd_ss(singles, doubles);
+        assert_eq_m128(converted, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
+
+        let singles = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
+        let doubles = _mm_setr_pd(f64::INFINITY, -5.0);
+        let converted = _mm_cvtsd_ss(singles, doubles);
+        let want = _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
+        assert_eq_m128(converted, want);
+    }
+
+    // Expects the low lane (-1.1) of the vector to be returned.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsd_f64() {
+        let input = _mm_setr_pd(-1.1, 2.2);
+        let low = _mm_cvtsd_f64(input);
+        assert_eq!(low, -1.1);
+    }
+
+    // Expects lane 0 of the result to be the converted lane 0 of the `f32` vector,
+    // with lane 1 equal to that of the `f64` vector.
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtss_sd() {
+        let doubles = _mm_setr_pd(-1.1, 2.2);
+        let singles = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let converted = _mm_cvtss_sd(doubles, singles);
+        assert_eq_m128d(converted, _mm_setr_pd(1.0, 2.2));
+
+        let doubles = _mm_setr_pd(-1.1, f64::INFINITY);
+        let singles = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
+        let converted = _mm_cvtss_sd(doubles, singles);
+        let want = _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY);
+        assert_eq_m128d(converted, want);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvttpd_epi32() {
+        let a = _mm_setr_pd(-1.1, 2.2);
+        let r = _mm_cvttpd_epi32(a);
+        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0)); // fractions are dropped; the last two elements are zero
+
+        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+        let r = _mm_cvttpd_epi32(a);
+        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); // non-finite inputs are expected to give i32::MIN
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvttsd_si32() {
+        let a = _mm_setr_pd(-1.1, 2.2);
+        let r = _mm_cvttsd_si32(a);
+        assert_eq!(r, -1); // first element with its fraction dropped
+
+        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+        let r = _mm_cvttsd_si32(a);
+        assert_eq!(r, i32::MIN); // negative infinity is expected to give i32::MIN
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvttps_epi32() {
+        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
+        let r = _mm_cvttps_epi32(a);
+        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6)); // every element has its fraction dropped (6.6 -> 6)
+
+        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
+        let r = _mm_cvttps_epi32(a);
+        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN)); // all four unrepresentable inputs give i32::MIN
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_sd() {
+        let r = _mm_set_sd(-1.0_f64);
+        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64)); // value lands in the first element, the second is zero
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set1_pd() {
+        let r = _mm_set1_pd(-1.0_f64);
+        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64)); // the value is expected in both elements
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_pd1() {
+        let r = _mm_set_pd1(-2.0_f64);
+        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64)); // same broadcast expectation as the `_mm_set1_pd` test
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_set_pd() {
+        let r = _mm_set_pd(1.0_f64, 5.0_f64);
+        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64)); // `set` and `setr` take their arguments in opposite order
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setr_pd() {
+        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
+        assert_eq_m128d(r, _mm_set_pd(-5.0_f64, 1.0_f64)); // checked against `_mm_set_pd` (reversed order), not against itself
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_setzero_pd() {
+        let r = _mm_setzero_pd();
+        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64)); // both elements are expected to be 0.0
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_load1_pd() {
+        let d = -5.0;
+        let r = _mm_load1_pd(&d); // `&d` is passed where the intrinsic expects a pointer to one f64
+        assert_eq_m128d(r, _mm_setr_pd(d, d)); // the loaded value is expected in both elements
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_load_pd1() {
+        let d = -5.0;
+        let r = _mm_load_pd1(&d);
+        assert_eq_m128d(r, _mm_setr_pd(d, d)); // same expectation as the `_mm_load1_pd` test
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpackhi_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(3.0, 4.0);
+        let r = _mm_unpackhi_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0)); // second element of `a`, then second element of `b`
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_unpacklo_pd() {
+        let a = _mm_setr_pd(1.0, 2.0);
+        let b = _mm_setr_pd(3.0, 4.0);
+        let r = _mm_unpacklo_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0)); // first element of `a`, then first element of `b`
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_shuffle_pd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(3., 4.);
+        let expected = _mm_setr_pd(1., 3.); // with control 0: first element of `a`, then first element of `b`
+        let r = _mm_shuffle_pd(a, b, 0); // NOTE(review): only control value 0 is exercised here
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_move_sd() {
+        let a = _mm_setr_pd(1., 2.);
+        let b = _mm_setr_pd(3., 4.);
+        let expected = _mm_setr_pd(3., 2.); // first element from `b`, second element from `a`
+        let r = _mm_move_sd(a, b);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castpd_ps() {
+        let a = _mm_set1_pd(0.); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_ps(0.);
+        let r = _mm_castpd_ps(a);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castpd_si128() {
+        let a = _mm_set1_pd(0.); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_epi64x(0);
+        let r = _mm_castpd_si128(a);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castps_pd() {
+        let a = _mm_set1_ps(0.); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_pd(0.);
+        let r = _mm_castps_pd(a);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castps_si128() {
+        let a = _mm_set1_ps(0.); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_epi32(0);
+        let r = _mm_castps_si128(a);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castsi128_pd() {
+        let a = _mm_set1_epi64x(0); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_pd(0.);
+        let r = _mm_castsi128_pd(a);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_castsi128_ps() {
+        let a = _mm_set1_epi32(0); // NOTE(review): only an all-zero input is checked, so lane layout is not exercised
+        let expected = _mm_set1_ps(0.);
+        let r = _mm_castsi128_ps(a);
+        assert_eq_m128(r, expected);
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_add_si64() {
+        let a = 1i64;
+        let b = 2i64;
+        let expected = 3i64;
+        let r = _mm_add_si64(mem::transmute(a), mem::transmute(b)); // the i64 operands are reinterpreted for the call
+        assert_eq!(mem::transmute::<__m64, i64>(r), expected); // the __m64 result is read back as an i64
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_mul_su32() {
+        let a = _mm_setr_pi32(1, 2);
+        let b = _mm_setr_pi32(3, 4);
+        let expected = 3u64; // 1 * 3: only the first 32-bit element of each operand contributes
+        let r = _mm_mul_su32(a, b);
+        assert_eq_m64(r, mem::transmute(expected)); // the u64 product is reinterpreted as __m64 for the comparison
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_sub_si64() {
+        let a = 1i64;
+        let b = 2i64;
+        let expected = -1i64; // a - b, which also exercises a negative result
+        let r = _mm_sub_si64(mem::transmute(a), mem::transmute(b));
+        assert_eq!(mem::transmute::<__m64, i64>(r), expected); // the __m64 result is read back as an i64
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_cvtpi32_pd() {
+        let a = _mm_setr_pi32(1, 2);
+        let expected = _mm_setr_pd(1., 2.); // each 32-bit integer becomes the f64 at the same position
+        let r = _mm_cvtpi32_pd(a);
+        assert_eq_m128d(r, expected);
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_set_epi64() {
+        let r = _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64));
+        assert_eq_m128i(r, _mm_setr_epi64x(2, 1)); // `set` takes its arguments in the opposite order to `setr`
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_set1_epi64() {
+        let r = _mm_set1_epi64(mem::transmute(1i64));
+        assert_eq_m128i(r, _mm_setr_epi64x(1, 1)); // the value is expected in both 64-bit elements
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_setr_epi64() {
+        let r = _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64));
+        assert_eq_m128i(r, _mm_setr_epi64x(1, 2)); // same element order as the `epi64x` constructor
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_movepi64_pi64() {
+        let r = _mm_movepi64_pi64(_mm_setr_epi64x(5, 0));
+        assert_eq_m64(r, _mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0)); // the first 64-bit element, spelled as eight bytes
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_movpi64_epi64() {
+        let r = _mm_movpi64_epi64(_mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
+        assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); // input fills the first 64-bit element, the second is zero
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_cvtpd_pi32() {
+        let a = _mm_setr_pd(5., 0.); // NOTE(review): only exactly representable inputs, so rounding is not exercised
+        let r = _mm_cvtpd_pi32(a);
+        assert_eq_m64(r, _mm_setr_pi32(5, 0));
+    }
+
+    #[simd_test(enable = "sse2,mmx")]
+    unsafe fn test_mm_cvttpd_pi32() {
+        use std::{f64, i32}; // brings the `f64::`/`i32::` constant paths used below into scope
+
+        let a = _mm_setr_pd(5., 0.);
+        let r = _mm_cvttpd_pi32(a);
+        assert_eq_m64(r, _mm_setr_pi32(5, 0));
+
+        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+        let r = _mm_cvttpd_pi32(a);
+        assert_eq_m64(r, _mm_setr_pi32(i32::MIN, i32::MIN)); // non-finite inputs are expected to give i32::MIN
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
new file mode 100644
index 00000000000..394909763da
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -0,0 +1,255 @@
+//! Streaming SIMD Extensions 3 (SSE3)
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::{simd_shuffle2, simd_shuffle4};
+use core_arch::x86::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Alternately subtract (even-indexed) and add (odd-indexed) packed single-precision
+/// (32-bit) floating-point elements in `b` from/to the packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
+    addsubps(a, b)
+}
+
+/// Alternately subtract (element 0) and add (element 1) packed double-precision
+/// (64-bit) floating-point elements in `b` from/to the packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
+    addsubpd(a, b)
+}
+
+/// Horizontally add adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and pack the results (`a`'s sum first, then `b`'s).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
+    haddpd(a, b)
+}
+
+/// Horizontally add adjacent pairs of single-precision (32-bit)
+/// floating-point elements in `a` and `b`, and pack the results (`a`'s two sums first, then `b`'s).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
+    haddps(a, b)
+}
+
+/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point
+/// elements in `a` and `b` (first minus second of each pair), and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(hsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
+    hsubpd(a, b)
+}
+
+/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point
+/// elements in `a` and `b` (first minus second of each pair), and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(hsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
+    hsubps(a, b)
+}
+
+/// Load 128 bits of integer data from unaligned memory at `mem_addr`.
+/// This intrinsic may perform better than `_mm_loadu_si128`
+/// when the data crosses a cache line boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(lddqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
+    mem::transmute(lddqu(mem_addr as *const _)) // the binding takes `*const i8` and returns `i8x16`, hence cast + transmute
+}
+
+/// Duplicate the low double-precision (64-bit) floating-point element
+/// from `a` into both elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
+    simd_shuffle2(a, a, [0, 0]) // index 0 twice: both result elements take a[0]
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory
+/// into both elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
+    _mm_load1_pd(mem_addr) // delegates to `_mm_load1_pd`; the attribute above asserts it still compiles to `movddup`
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+/// from `a`, so that element 1 fills positions 0-1 and element 3 fills positions 2-3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movshdup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
+    simd_shuffle4(a, a, [1, 1, 3, 3])
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements
+/// from `a`, so that element 0 fills positions 0-1 and element 2 fills positions 2-3.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movsldup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
+    simd_shuffle4(a, a, [0, 0, 2, 2])
+}
+
+#[allow(improper_ctypes)] // NOTE(review): presumably needed because SIMD vector types trip the lint — confirm
+extern "C" { // each declaration is bound to the `llvm.x86.sse3.*` name given by its `link_name`
+    #[link_name = "llvm.x86.sse3.addsub.ps"]
+    fn addsubps(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse3.addsub.pd"]
+    fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse3.hadd.pd"]
+    fn haddpd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse3.hadd.ps"]
+    fn haddps(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse3.hsub.pd"]
+    fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse3.hsub.ps"]
+    fn hsubps(a: __m128, b: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse3.ldu.dq"]
+    fn lddqu(mem_addr: *const i8) -> i8x16; // byte pointer in, `i8x16` out; the public wrapper converts both
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_addsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_addsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0)); // a0-b0, a1+b1, a2-b2, a3+b3
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_addsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_addsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0)); // a0-b0, a1+b1
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_hadd_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hadd_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0)); // a0+a1, b0+b1
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_hadd_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hadd_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0)); // a0+a1, a2+a3, b0+b1, b2+b3
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_hsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0)); // a0-a1, b0-b1
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_hsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0)); // a0-a1, a2-a3, b0-b1, b2-b3
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_lddqu_si128() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16,
+        );
+        let r = _mm_lddqu_si128(&a); // loads back from the address of the local vector
+        assert_eq_m128i(a, r); // the load must reproduce all sixteen bytes
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_movedup_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let r = _mm_movedup_pd(a);
+        assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0)); // element 0 in both positions
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_movehdup_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let r = _mm_movehdup_ps(a);
+        assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0)); // elements 1 and 3, each repeated
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_moveldup_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let r = _mm_moveldup_ps(a);
+        assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0)); // elements 0 and 2, each repeated
+    }
+
+    #[simd_test(enable = "sse3")]
+    unsafe fn test_mm_loaddup_pd() {
+        let d = -5.0;
+        let r = _mm_loaddup_pd(&d);
+        assert_eq_m128d(r, _mm_setr_pd(d, d)); // the loaded value is expected in both elements
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
new file mode 100644
index 00000000000..e7d7ea837ab
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -0,0 +1,1938 @@
+//! Streaming SIMD Extensions 4.1 (SSE4.1)
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+// SSE4 rounding constants
+/// round to nearest
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
+/// round down
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
+/// round up
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
+/// truncate
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
+/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
+/// do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
+/// suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NO_EXC: i32 = 0x08;
+/// round to nearest and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NINT: i32 = 0x00;
+/// round down and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
+/// round up and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
+/// truncate and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
+/// use MXCSR.RC and do not suppress exceptions; see
+/// `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
+/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
+
+/// Blend packed 8-bit integers from `a` and `b` using `mask`
+///
+/// The high bit of each corresponding mask byte determines the selection.
+/// If the high bit is set the element of `b` is selected. The element
+/// of `a` is selected otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
+    mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
+}
+
+/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
+///
+/// The mask bits determine the selection. A clear bit selects the
+/// corresponding element of `a`, and a set bit the corresponding
+/// element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+// Note: LLVM7 prefers the single-precision floating-point domain when possible
+// see https://bugs.llvm.org/show_bug.cgi?id=38195
+// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
+#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pblendw(a, b, $imm8)
+        };
+    }
+    mem::transmute(constify_imm8!(imm8, call)) // NOTE(review): `constify_imm8!` is defined elsewhere; presumably it invokes `call` with `imm8` as a constant — confirm
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using `mask`: a set sign bit in a mask element selects `b`, a clear one `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
+    blendvpd(a, b, mask)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using `mask`: a set sign bit in a mask element selects `b`, a clear one `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
+    blendvps(a, b, mask)
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using control mask `imm2`: a set bit selects the element of `b`, a clear bit that of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+// Note: LLVM7 prefers the single-precision floating-point domain when possible
+// see https://bugs.llvm.org/show_bug.cgi?id=38195
+// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
+#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
+    macro_rules! call {
+        ($imm2:expr) => {
+            blendpd(a, b, $imm2)
+        };
+    }
+    constify_imm2!(imm2, call) // NOTE(review): `constify_imm2!` is defined elsewhere; presumably it invokes `call` with `imm2` as a constant — confirm
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using mask `imm4`: a set bit selects the element of `b`, a clear bit that of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            blendps(a, b, $imm4)
+        };
+    }
+    constify_imm4!(imm4, call) // NOTE(review): `constify_imm4!` is defined elsewhere; presumably it invokes `call` with `imm4` as a constant — confirm
+}
+
+/// Extract a single-precision (32-bit) floating-point element from `a`, selected
+/// by the low two bits of `imm8`, and return its bit pattern reinterpreted as an `i32`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(extractps, imm8 = 0)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
+    mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11)) // transmute keeps the f32 bits; no numeric conversion happens
+}
+
+/// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
+    let imm8 = (imm8 & 15) as u32; // only the low four bits select one of the sixteen bytes
+    simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32 // extracting as `u8` makes the widening zero-extend
+}
+
+/// Extract a 32-bit integer from `a` selected with `imm8`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(
+    all(test, not(target_os = "windows")),
+    assert_instr(extractps, imm8 = 1)
+)]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
+    let imm8 = (imm8 & 3) as u32; // only the low two bits select one of the four elements
+    simd_extract::<_, i32>(a.as_i32x4(), imm8)
+}
+
+/// Select a single value in `b` to store at some position in `a`,
+/// then zero elements according to `imm8`.
+///
+/// `imm8` specifies which bits from operand `b` will be copied, which bits in
+/// the result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `b`:
+///     - `00`: Selects bits `[31:0]` from operand `b`.
+///     - `01`: Selects bits `[63:32]` from operand `b`.
+///     - `10`: Selects bits `[95:64]` from operand `b`.
+///     - `11`: Selects bits `[127:96]` from operand `b`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `b` are copied:
+///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
+///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
+///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
+///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            insertps(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Return a copy of `a` with the low 8 bits of `i` inserted at the
+/// location specified by the low four bits of `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
+}
+
+/// Return a copy of `a` with the 32-bit integer from `i` inserted at the
+/// location specified by the low two bits of `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b` and return packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b` and return packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminud(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(packusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed 64-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Sign extends packed 8-bit integers in `a` to packed 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute(simd_cast::<_, i16x8>(a))
+}
+
+/// Sign extends packed 8-bit integers in `a` to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Sign extends packed 8-bit integers in the low 2 bytes of `a` to packed
+/// 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Sign extends packed 16-bit integers in `a` to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
+    let a = a.as_i16x8();
+    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Sign extends packed 16-bit integers in `a` to packed 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i16x8();
+    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Sign extends packed 32-bit integers in `a` to packed 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i32x4();
+    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero extends packed unsigned 8-bit integers in `a` to packed 16-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute(simd_cast::<_, i16x8>(a))
+}
+
+/// Zero extends packed unsigned 8-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Zero extends packed unsigned 8-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero extends packed unsigned 16-bit integers in `a`
+/// to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
+    let a = a.as_u16x8();
+    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Zero extends packed unsigned 16-bit integers in `a`
+/// to packed 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u16x8();
+    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero extends packed unsigned 32-bit integers in `a`
+/// to packed 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u32x4();
+    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Returns the dot product of two `__m128d` vectors.
+///
+/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
+/// If a condition mask bit is zero, the corresponding multiplication is
+/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
+/// the dot product will be stored in the return value component. Otherwise if
+/// the broadcast mask bit is zero then the return component will be zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            dppd(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Returns the dot product of two `__m128` vectors.
+///
+/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
+/// If a condition mask bit is zero, the corresponding multiplication is
+/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
+/// the dot product will be stored in the return value component. Otherwise if
+/// the broadcast mask bit is zero then the return component will be zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            dpps(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// down to an integer value, and stores the results as packed double-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
+    roundpd(a, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// down to an integer value, and stores the results as packed single-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
+    roundps(a, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// down to an integer value, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element of the intrinsic
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
+    roundsd(a, b, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// down to an integer value, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
+    roundss(a, b, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// up to an integer value, and stores the results as packed double-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
+    roundpd(a, _MM_FROUND_CEIL)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// up to an integer value, and stores the results as packed single-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
+    roundps(a, _MM_FROUND_CEIL)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// up to an integer value, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
+    roundsd(a, b, _MM_FROUND_CEIL)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// up to an integer value, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
+    roundss(a, b, _MM_FROUND_CEIL)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// using the `rounding` parameter, and stores the results as packed
+/// double-precision floating-point elements.
+/// Rounding is done according to the `rounding` parameter, which can be one of:
+///
+/// ```
+/// #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # extern crate std_detect as std;
+///
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            roundpd(a, $imm4)
+        };
+    }
+    constify_imm4!(rounding, call)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// using the `rounding` parameter, and stores the results as packed
+/// single-precision floating-point elements.
+/// Rounding is done according to the `rounding` parameter, which can be one of:
+///
+/// ```
+/// #![feature(stdsimd)]
+///
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # extern crate std_detect as std;
+///
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            roundps(a, $imm4)
+        };
+    }
+    constify_imm4!(rounding, call)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// using the `rounding` parameter, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element of the intrinsic
+/// result.
+/// Rounding is done according to the `rounding` parameter, which can be one of:
+///
+/// ```
+/// #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # extern crate std_detect as std;
+///
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            roundsd(a, b, $imm4)
+        };
+    }
+    constify_imm4!(rounding, call)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// using the `rounding` parameter, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+/// Rounding is done according to the `rounding` parameter, which can be one of:
+///
+/// ```
+/// #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # extern crate std_detect as std;
+///
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            roundss(a, b, $imm4)
+        };
+    }
+    constify_imm4!(rounding, call)
+}
+
+/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i`
+/// vector, returning a vector containing its value in its first position,
+/// and its index in its second position; all other elements are set to
+/// zero.
+///
+/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
+/// instruction.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit vector of type `__m128i`.
+///
+/// Returns:
+///
+/// A 128-bit value where:
+///
+/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
+/// * bits `[18:16]` - contain the index of the minimum value,
+/// * remaining bits are set to `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(phminposuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
+    mem::transmute(phminposuw(a.as_u16x8()))
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit
+/// element in `a` and `b`, and returns the signed 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiplies the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and returns the lowest 32 bits of each,
+/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
+/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
+/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
+/// return a negative number.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Subtracts 8-bit unsigned integer values and computes the absolute
+/// values of the differences to the corresponding bits in the destination.
+/// Then sums of the absolute differences are returned according to the bit
+/// fields in the immediate operand.
+///
+/// The following algorithm is performed:
+///
+/// ```ignore
+/// i = imm8[2] * 4
+/// j = imm8[1:0] * 4
+/// for k := 0 to 7
+///     d0 = abs(a[i + k + 0] - b[j + 0])
+///     d1 = abs(a[i + k + 1] - b[j + 1])
+///     d2 = abs(a[i + k + 2] - b[j + 2])
+///     d3 = abs(a[i + k + 3] - b[j + 3])
+///     r[k] = d0 + d1 + d2 + d3
+/// ```
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit vector of type `__m128i`.
+/// * `b` - A 128-bit vector of type `__m128i`.
+/// * `imm8` - An 8-bit immediate operand specifying how the absolute
+///   differences are to be calculated:
+///     * Bit `[2]` specifies the offset for operand `a`
+///     * Bits `[1:0]` specify the offset for operand `b`
+///
+/// Returns:
+///
+/// * A `__m128i` vector containing the sums of the sets of absolute
+///   differences between both operands.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            mpsadbw(a, b, $imm8)
+        };
+    }
+    mem::transmute(constify_imm3!(imm8, call))
+}
+
+/// Tests whether the bits of the 128-bit integer vector `a` selected by
+/// `mask` are all zeros.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+///   operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all zeros,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
+    ptestz(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the bits of the 128-bit integer vector `a` selected by
+/// `mask` are all ones.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+///   operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
+    ptestc(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the bits of the 128-bit integer vector `a` selected by `mask`
+/// are neither all zeros nor all ones.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+///   operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are neither all zeros nor all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
+    ptestnzc(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the bits of the 128-bit integer vector `a` selected by
+/// `mask` are all zeros (same operation as `_mm_testz_si128`).
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+///   operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all zeros,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
+    _mm_testz_si128(a, mask)
+}
+
+/// Tests whether all bits in the 128-bit integer vector `a` are set to
+/// one.
+///
+/// Argument:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+///
+/// Returns:
+///
+/// * `1` - if all bits in the operand are set to 1,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
+    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
+}
+
+/// Tests whether the bits of the 128-bit integer vector `a` selected by `mask`
+/// are neither all zeros nor all ones (same as `_mm_testnzc_si128`).
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+///   operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are neither all zeros nor all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
+    _mm_testnzc_si128(a, mask)
+}
+
+#[allow(improper_ctypes)]
+extern "C" { // each declaration is bound by `link_name` to an `llvm.x86.sse41.*` intrinsic
+    #[link_name = "llvm.x86.sse41.pblendvb"]
+    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.blendvpd"]
+    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
+    #[link_name = "llvm.x86.sse41.blendvps"]
+    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
+    #[link_name = "llvm.x86.sse41.blendpd"]
+    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
+    #[link_name = "llvm.x86.sse41.blendps"]
+    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
+    #[link_name = "llvm.x86.sse41.pblendw"]
+    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
+    #[link_name = "llvm.x86.sse41.insertps"]
+    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
+    #[link_name = "llvm.x86.sse41.pmaxsb"]
+    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.pmaxuw"]
+    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.pmaxsd"]
+    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse41.pmaxud"]
+    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
+    #[link_name = "llvm.x86.sse41.pminsb"]
+    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.pminuw"]
+    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.pminsd"]
+    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse41.pminud"]
+    fn pminud(a: u32x4, b: u32x4) -> u32x4;
+    #[link_name = "llvm.x86.sse41.packusdw"]
+    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
+    #[link_name = "llvm.x86.sse41.dppd"]
+    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
+    #[link_name = "llvm.x86.sse41.dpps"]
+    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
+    #[link_name = "llvm.x86.sse41.round.pd"]
+    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
+    #[link_name = "llvm.x86.sse41.round.ps"]
+    fn roundps(a: __m128, rounding: i32) -> __m128;
+    #[link_name = "llvm.x86.sse41.round.sd"]
+    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
+    #[link_name = "llvm.x86.sse41.round.ss"]
+    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
+    #[link_name = "llvm.x86.sse41.phminposuw"]
+    fn phminposuw(a: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.pmuldq"]
+    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
+    #[link_name = "llvm.x86.sse41.mpsadbw"]
+    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.ptestz"]
+    fn ptestz(a: i64x2, mask: i64x2) -> i32;
+    #[link_name = "llvm.x86.sse41.ptestc"]
+    fn ptestc(a: i64x2, mask: i64x2) -> i32;
+    #[link_name = "llvm.x86.sse41.ptestnzc"]
+    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blendv_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        #[rustfmt::skip]
+        let mask = _mm_setr_epi8(
+            0, -1, 0, -1, 0, -1, 0, -1,
+            0, -1, 0, -1, 0, -1, 0, -1,
+        );
+        #[rustfmt::skip]
+        let e = _mm_setr_epi8(
+            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
+        );
+        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blendv_pd() {
+        let a = _mm_set1_pd(0.0);
+        let b = _mm_set1_pd(1.0);
+        let mask = mem::transmute(_mm_setr_epi64x(0, -1));
+        let r = _mm_blendv_pd(a, b, mask);
+        let e = _mm_setr_pd(0.0, 1.0);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blendv_ps() {
+        let a = _mm_set1_ps(0.0);
+        let b = _mm_set1_ps(1.0);
+        let mask = mem::transmute(_mm_setr_epi32(0, -1, 0, -1));
+        let r = _mm_blendv_ps(a, b, mask);
+        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blend_pd() {
+        let a = _mm_set1_pd(0.0);
+        let b = _mm_set1_pd(1.0);
+        let r = _mm_blend_pd(a, b, 0b10);
+        let e = _mm_setr_pd(0.0, 1.0);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blend_ps() {
+        let a = _mm_set1_ps(0.0);
+        let b = _mm_set1_ps(1.0);
+        let r = _mm_blend_ps(a, b, 0b1010);
+        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_blend_epi16() {
+        let a = _mm_set1_epi16(0);
+        let b = _mm_set1_epi16(1);
+        let r = _mm_blend_epi16(a, b, 0b1010_1100);
+        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_extract_ps() {
+        let lanes = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
+        let direct: f32 = mem::transmute(_mm_extract_ps(lanes, 1));
+        assert_eq!(direct, 1.0);
+        let wrapped: f32 = mem::transmute(_mm_extract_ps(lanes, 5)); // same lane as index 1
+        assert_eq!(wrapped, 1.0);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_extract_epi8() {
+        #[rustfmt::skip]
+        let bytes = _mm_setr_epi8(
+            -1, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15
+        );
+        // Lane 0 holds -1 and is expected back zero-extended as 0xFF.
+        assert_eq!(_mm_extract_epi8(bytes, 0), 0xFF);
+        // Index 19 is expected to yield the value stored in lane 3.
+        assert_eq!(_mm_extract_epi8(bytes, 19), 3);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_extract_epi32() {
+        let lanes = _mm_setr_epi32(0, 1, 2, 3);
+        let direct = _mm_extract_epi32(lanes, 1);
+        assert_eq!(direct, 1);
+        let wrapped = _mm_extract_epi32(lanes, 5); // same lane as index 1
+        assert_eq!(wrapped, 1);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_insert_ps() {
+        let a = _mm_set1_ps(1.0);
+        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let r = _mm_insert_ps(a, b, 0b11_00_1100);
+        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_insert_epi8() {
+        let zeros = _mm_set1_epi8(0);
+        let expected = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        let direct = _mm_insert_epi8(zeros, 32, 1);
+        assert_eq_m128i(direct, expected);
+        let wrapped = _mm_insert_epi8(zeros, 32, 17); // expected to hit lane 1 as well
+        assert_eq_m128i(wrapped, expected);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_insert_epi32() {
+        let zeros = _mm_set1_epi32(0);
+        let expected = _mm_setr_epi32(0, 32, 0, 0);
+        let direct = _mm_insert_epi32(zeros, 32, 1);
+        assert_eq_m128i(direct, expected);
+        let wrapped = _mm_insert_epi32(zeros, 32, 5); // expected to hit lane 1 as well
+        assert_eq_m128i(wrapped, expected);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_max_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 4, 5, 8, 9, 12, 13, 16,
+            17, 20, 21, 24, 25, 28, 29, 32,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            2, 3, 6, 7, 10, 11, 14, 15,
+            18, 19, 22, 23, 26, 27, 30, 31,
+        );
+        let r = _mm_max_epi8(a, b);
+        #[rustfmt::skip]
+        let e = _mm_setr_epi8(
+            2, 4, 6, 8, 10, 12, 14, 16,
+            18, 20, 22, 24, 26, 28, 30, 32,
+        );
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_max_epu16() {
+        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = _mm_max_epu16(a, b);
+        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_max_epi32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_max_epi32(a, b);
+        let e = _mm_setr_epi32(2, 4, 6, 8);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_max_epu32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_max_epu32(a, b);
+        let e = _mm_setr_epi32(2, 4, 6, 8);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epi8_1() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 4, 5, 8, 9, 12, 13, 16,
+            17, 20, 21, 24, 25, 28, 29, 32,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            2, 3, 6, 7, 10, 11, 14, 15,
+            18, 19, 22, 23, 26, 27, 30, 31,
+        );
+        let r = _mm_min_epi8(a, b);
+        #[rustfmt::skip]
+        let e = _mm_setr_epi8(
+            1, 3, 5, 7, 9, 11, 13, 15,
+            17, 19, 21, 23, 25, 27, 29, 31,
+        );
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epi8_2() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, -4, -5, 8, -9, -12, 13, -16,
+            17, 20, 21, 24, 25, 28, 29, 32,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            2, -3, -6, 7, -10, -11, 14, -15,
+            18, 19, 22, 23, 26, 27, 30, 31,
+        );
+        let r = _mm_min_epi8(a, b);
+        #[rustfmt::skip]
+        let e = _mm_setr_epi8(
+            1, -4, -6, 7, -10, -12, 13, -16,
+            17, 19, 21, 23, 25, 27, 29, 31,
+        );
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epu16() {
+        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = _mm_min_epu16(a, b);
+        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epi32_1() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_min_epi32(a, b);
+        let e = _mm_setr_epi32(1, 3, 5, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epi32_2() {
+        let a = _mm_setr_epi32(-1, 4, 5, -7);
+        let b = _mm_setr_epi32(-2, 3, -6, 8);
+        let r = _mm_min_epi32(a, b);
+        let e = _mm_setr_epi32(-2, 3, -6, -7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_min_epu32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_min_epu32(a, b);
+        let e = _mm_setr_epi32(1, 3, 5, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_packus_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(-1, -2, -3, -4);
+        let r = _mm_packus_epi32(a, b);
+        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cmpeq_epi64() {
+        let a = _mm_setr_epi64x(0, 1);
+        let b = _mm_setr_epi64x(0, 0);
+        let r = _mm_cmpeq_epi64(a, b);
+        let e = _mm_setr_epi64x(-1, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi8_epi16() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi16(a);
+        let e = _mm_set1_epi16(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi16(a);
+        let e = _mm_set1_epi16(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi8_epi32() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi32(a);
+        let e = _mm_set1_epi32(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi32(a);
+        let e = _mm_set1_epi32(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi8_epi64() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi64(a);
+        let e = _mm_set1_epi64x(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi16_epi32() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepi16_epi32(a);
+        let e = _mm_set1_epi32(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi16(-10);
+        let r = _mm_cvtepi16_epi32(a);
+        let e = _mm_set1_epi32(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi16_epi64() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepi16_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi16(-10);
+        let r = _mm_cvtepi16_epi64(a);
+        let e = _mm_set1_epi64x(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepi32_epi64() {
+        let a = _mm_set1_epi32(10);
+        let r = _mm_cvtepi32_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+        let a = _mm_set1_epi32(-10);
+        let r = _mm_cvtepi32_epi64(a);
+        let e = _mm_set1_epi64x(-10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu8_epi16() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi16(a);
+        let e = _mm_set1_epi16(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu8_epi32() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi32(a);
+        let e = _mm_set1_epi32(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu8_epi64() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu16_epi32() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepu16_epi32(a);
+        let e = _mm_set1_epi32(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu16_epi64() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepu16_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_cvtepu32_epi64() {
+        let a = _mm_set1_epi32(10);
+        let r = _mm_cvtepu32_epi64(a);
+        let e = _mm_set1_epi64x(10);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_dp_pd() {
+        let a = _mm_setr_pd(2.0, 3.0);
+        let b = _mm_setr_pd(1.0, 4.0);
+        let e = _mm_setr_pd(14.0, 0.0);
+        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_dp_ps() {
+        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
+        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
+        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
+        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_floor_pd() {
+        let a = _mm_setr_pd(2.5, 4.5);
+        let r = _mm_floor_pd(a);
+        let e = _mm_setr_pd(2.0, 4.0);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_floor_ps() {
+        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+        let r = _mm_floor_ps(a);
+        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_floor_sd() {
+        let a = _mm_setr_pd(2.5, 4.5);
+        let b = _mm_setr_pd(-1.5, -3.5);
+        let r = _mm_floor_sd(a, b);
+        let e = _mm_setr_pd(-2.0, 4.5);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_floor_ss() {
+        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
+        let r = _mm_floor_ss(a, b);
+        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_ceil_pd() {
+        let a = _mm_setr_pd(1.5, 3.5);
+        let r = _mm_ceil_pd(a);
+        let e = _mm_setr_pd(2.0, 4.0);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_ceil_ps() {
+        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let r = _mm_ceil_ps(a);
+        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_ceil_sd() {
+        let a = _mm_setr_pd(1.5, 3.5);
+        let b = _mm_setr_pd(-2.5, -4.5);
+        let r = _mm_ceil_sd(a, b);
+        let e = _mm_setr_pd(-2.0, 3.5);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_ceil_ss() {
+        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
+        let r = _mm_ceil_ss(a, b);
+        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_round_pd() {
+        let a = _mm_setr_pd(1.25, 3.75);
+        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
+        let e = _mm_setr_pd(1.0, 4.0);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_round_ps() {
+        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
+        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
+        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_round_sd() {
+        let upper_src = _mm_setr_pd(1.5, 3.5);
+        let lower_src = _mm_setr_pd(-2.5, -4.5);
+        let saved_mode = _MM_GET_ROUNDING_MODE(); // restored right after the rounding call
+        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+        let rounded = _mm_round_sd(upper_src, lower_src, _MM_FROUND_CUR_DIRECTION);
+        _MM_SET_ROUNDING_MODE(saved_mode);
+        // Lane 0: -2.5 from `lower_src`, rounded toward zero; lane 1: from `upper_src`.
+        assert_eq_m128d(rounded, _mm_setr_pd(-2.0, 3.5));
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_round_ss() {
+        let upper_src = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let lower_src = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
+        let saved_mode = _MM_GET_ROUNDING_MODE(); // restored right after the rounding call
+        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+        let rounded = _mm_round_ss(upper_src, lower_src, _MM_FROUND_CUR_DIRECTION);
+        _MM_SET_ROUNDING_MODE(saved_mode);
+        // Lane 0: -1.75 from `lower_src`, rounded to nearest; lanes 1-3: from `upper_src`.
+        assert_eq_m128(rounded, _mm_setr_ps(-2.0, 3.5, 7.5, 15.5));
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_minpos_epu16_1() {
+        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_minpos_epu16_2() {
+        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_mul_epi32() {
+        {
+            let a = _mm_setr_epi32(1, 1, 1, 1);
+            let b = _mm_setr_epi32(1, 2, 3, 4);
+            let r = _mm_mul_epi32(a, b);
+            let e = _mm_setr_epi64x(1, 3);
+            assert_eq_m128i(r, e);
+        }
+        {
+            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
+            let b = _mm_setr_epi32(
+                -20, -256, /* ignored */
+                666666, 666666, /* ignored */
+            );
+            let r = _mm_mul_epi32(a, b);
+            let e = _mm_setr_epi64x(-300, 823043843622);
+            assert_eq_m128i(r, e);
+        }
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_mullo_epi32() {
+        {
+            let ones = _mm_setr_epi32(1, 1, 1, 1);
+            let factors = _mm_setr_epi32(1, 2, 3, 4);
+            let product = _mm_mullo_epi32(ones, factors);
+            let expected = _mm_setr_epi32(1, 2, 3, 4);
+            assert_eq_m128i(product, expected);
+        }
+        {
+            let lhs = _mm_setr_epi32(15, -2, 1234567, 99999);
+            let rhs = _mm_setr_epi32(-20, -256, 666666, -99999);
+            let product = _mm_mullo_epi32(lhs, rhs);
+            // Attention: the expected lanes are the products truncated to
+            // 32 bits, so the top kept bit of lane 2 acts as the sign bit:
+            // 1234567 * 666666 = 823043843622, which truncates to -1589877210
+            let expected = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
+            assert_eq_m128i(product, expected);
+        }
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_minpos_epu16() {
+        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_mpsadbw_epu8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+
+        let r = _mm_mpsadbw_epu8(a, a, 0b000);
+        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
+        assert_eq_m128i(r, e);
+
+        let r = _mm_mpsadbw_epu8(a, a, 0b001);
+        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
+        assert_eq_m128i(r, e);
+
+        let r = _mm_mpsadbw_epu8(a, a, 0b100);
+        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
+        assert_eq_m128i(r, e);
+
+        let r = _mm_mpsadbw_epu8(a, a, 0b101);
+        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
+        assert_eq_m128i(r, e);
+
+        let r = _mm_mpsadbw_epu8(a, a, 0b111);
+        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_testz_si128() {
+        let a = _mm_set1_epi8(1);
+        let mask = _mm_set1_epi8(0);
+        let r = _mm_testz_si128(a, mask);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b110);
+        let r = _mm_testz_si128(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(0b011);
+        let mask = _mm_set1_epi8(0b100);
+        let r = _mm_testz_si128(a, mask);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_testc_si128() {
+        let a = _mm_set1_epi8(-1);
+        let mask = _mm_set1_epi8(0);
+        let r = _mm_testc_si128(a, mask);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b110);
+        let r = _mm_testc_si128(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b100);
+        let r = _mm_testc_si128(a, mask);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_testnzc_si128() {
+        let a = _mm_set1_epi8(0);
+        let mask = _mm_set1_epi8(1);
+        let r = _mm_testnzc_si128(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(-1);
+        let mask = _mm_set1_epi8(0);
+        let r = _mm_testnzc_si128(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b110);
+        let r = _mm_testnzc_si128(a, mask);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b101);
+        let r = _mm_testnzc_si128(a, mask);
+        assert_eq!(r, 0);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_test_all_zeros() {
+        let a = _mm_set1_epi8(1);
+        let mask = _mm_set1_epi8(0);
+        let r = _mm_test_all_zeros(a, mask);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b110);
+        let r = _mm_test_all_zeros(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(0b011);
+        let mask = _mm_set1_epi8(0b100);
+        let r = _mm_test_all_zeros(a, mask);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_test_all_ones() {
+        let a = _mm_set1_epi8(-1);
+        let r = _mm_test_all_ones(a);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let r = _mm_test_all_ones(a);
+        assert_eq!(r, 0);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_test_mix_ones_zeros() {
+        let a = _mm_set1_epi8(0);
+        let mask = _mm_set1_epi8(1);
+        let r = _mm_test_mix_ones_zeros(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(-1);
+        let mask = _mm_set1_epi8(0);
+        let r = _mm_test_mix_ones_zeros(a, mask);
+        assert_eq!(r, 0);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b110);
+        let r = _mm_test_mix_ones_zeros(a, mask);
+        assert_eq!(r, 1);
+        let a = _mm_set1_epi8(0b101);
+        let mask = _mm_set1_epi8(0b101);
+        let r = _mm_test_mix_ones_zeros(a, mask);
+        assert_eq!(r, 0);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse42.rs b/library/stdarch/crates/core_arch/src/x86/sse42.rs
new file mode 100644
index 00000000000..c32568fefcc
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse42.rs
@@ -0,0 +1,941 @@
+//! Streaming SIMD Extensions 4.2 (SSE4.2)
+//!
+//! Extends SSE4.1 with STTNI (String and Text New Instructions).
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+
+/// String contains unsigned 8-bit characters *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000;
+/// String contains unsigned 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001;
+/// String contains signed 8-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010;
+/// String contains signed 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011;
+
+/// For each character in `a`, find if it is in `b` *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000;
+/// For each character in `a`, determine if
+/// `b[0] <= c <= b[1] or b[2] <= c <= b[3]...`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100;
+/// The strings defined by `a` and `b` are equal
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000;
+/// Search for the defined substring in the target
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100;
+
+/// Do not negate results *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000;
+/// Negate results
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000;
+/// Do not negate results before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000;
+/// Negate results only before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000;
+
+/// **Index only**: return the least significant bit *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000;
+/// **Index only**: return the most significant bit
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000;
+
+/// **Mask only**: return the bit mask *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_BIT_MASK: i32 = 0b0000_0000;
+/// **Mask only**: return the byte mask
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000;
+
+/// Compares the implicit-length packed strings in `a` and `b` according to
+/// the control value `imm8` and returns the mask generated by the comparison.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrm)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    // Both operands are viewed as `i8x16` for the binding; `constify_imm8!`
+    // presumably hands `call!` a literal chosen from `imm8` (TODO confirm).
+    macro_rules! call {
+        ($control:expr) => {
+            pcmpistrm128(a.as_i8x16(), b.as_i8x16(), $control)
+        };
+    }
+    mem::transmute(constify_imm8!(imm8, call))
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8` and return the generated index. Similar to
+/// [`_mm_cmpestri`] with the exception that [`_mm_cmpestri`] requires the
+/// lengths of `a` and `b` to be explicitly specified.
+///
+/// # Control modes
+///
+/// The control specified by `imm8` may be one or more of the following.
+///
+/// ## Data size and signedness
+///
+///  - [`_SIDD_UBYTE_OPS`] - Default
+///  - [`_SIDD_UWORD_OPS`]
+///  - [`_SIDD_SBYTE_OPS`]
+///  - [`_SIDD_SWORD_OPS`]
+///
+/// ## Comparison options
+///  - [`_SIDD_CMP_EQUAL_ANY`] - Default
+///  - [`_SIDD_CMP_RANGES`]
+///  - [`_SIDD_CMP_EQUAL_EACH`]
+///  - [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ## Result polarity
+///  - [`_SIDD_POSITIVE_POLARITY`] - Default
+///  - [`_SIDD_NEGATIVE_POLARITY`]
+///
+/// ## Bit returned
+///  - [`_SIDD_LEAST_SIGNIFICANT`] - Default
+///  - [`_SIDD_MOST_SIGNIFICANT`]
+///
+/// # Examples
+///
+/// Find a substring using [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ```
+/// # #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// # #[cfg(not(dox))]
+/// # use real_std::prelude::v1::*;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.2") {
+/// #         #[target_feature(enable = "sse4.2")]
+/// #         unsafe fn worker() {
+/// let haystack = b"This is a long string of text data\r\n\tthat extends
+/// multiple lines";
+/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0";
+///
+/// let a = _mm_loadu_si128(needle.as_ptr() as *const _);
+/// let hop = 16;
+/// let mut indexes = Vec::new();
+///
+/// // Chunk the haystack into 16 byte chunks and find
+/// // the first "\r\n\t" in the chunk.
+/// for (i, chunk) in haystack.chunks(hop).enumerate() {
+///     let b = _mm_loadu_si128(chunk.as_ptr() as *const _);
+///     let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED);
+///     if idx != 16 {
+///         indexes.push((idx as usize) + (i * hop));
+///     }
+/// }
+/// assert_eq!(indexes, vec![34]);
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// The `_mm_cmpistri` intrinsic may also be used to find the existence of
+/// one or more of a given set of characters in the haystack.
+///
+/// ```
+/// # #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.2") {
+/// #         #[target_feature(enable = "sse4.2")]
+/// #         unsafe fn worker() {
+/// // Ensure your input is a full 16 bytes long (pad it with NUL bytes)
+/// let password = b"hunter2\0\0\0\0\0\0\0\0\0";
+/// let special_chars = b"!@#$%^&*()[]:;<>";
+///
+/// // Load the input
+/// let a = _mm_loadu_si128(special_chars.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(password.as_ptr() as *const _);
+///
+/// // Use _SIDD_CMP_EQUAL_ANY to find the index of any bytes in b
+/// let idx = _mm_cmpistri(a.into(), b.into(), _SIDD_CMP_EQUAL_ANY);
+///
+/// if idx < 16 {
+///     println!("Congrats! Your password contains a special character");
+///     # panic!("{:?} does not contain a special character", password);
+/// } else {
+///     println!("Your password should contain a special character");
+/// }
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// Find the index of the first character in the haystack that is within a
+/// range of characters.
+///
+/// ```
+/// # #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.2") {
+/// #         #[target_feature(enable = "sse4.2")]
+/// #         unsafe fn worker() {
+/// # let b = b":;<=>?@[\\]^_`abc";
+/// # let b = _mm_loadu_si128(b.as_ptr() as *const _);
+///
+/// // Specify the ranges of values to be searched for [A-Za-z0-9].
+/// let a = b"AZaz09\0\0\0\0\0\0\0\0\0\0";
+/// let a = _mm_loadu_si128(a.as_ptr() as *const _);
+///
+/// // Use _SIDD_CMP_RANGES to find the index of first byte in ranges.
+/// // Which in this case will be the first alpha numeric byte found
+/// // in the string.
+/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_RANGES);
+///
+/// if idx < 16 {
+///     println!("Found an alpha numeric character");
+///     # assert_eq!(idx, 13);
+/// } else {
+///     println!("Did not find an alpha numeric character");
+/// }
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// Working with 16-bit characters.
+///
+/// ```
+/// # #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.2") {
+/// #         #[target_feature(enable = "sse4.2")]
+/// #         unsafe fn worker() {
+/// # let mut some_utf16_words = [0u16; 8];
+/// # let mut more_utf16_words = [0u16; 8];
+/// # '❤'.encode_utf16(&mut some_utf16_words);
+/// # '𝕊'.encode_utf16(&mut more_utf16_words);
+/// // Load the input
+/// let a = _mm_loadu_si128(some_utf16_words.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(more_utf16_words.as_ptr() as *const _);
+///
+/// // Specify _SIDD_UWORD_OPS to compare words instead of bytes, and
+/// // use _SIDD_CMP_EQUAL_EACH to compare the two strings.
+/// let idx = _mm_cmpistri(a, b, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH);
+///
+/// if idx == 0 {
+///     println!("16-bit unicode strings were equal!");
+///     # panic!("Strings should not be equal!")
+/// } else {
+///     println!("16-bit unicode strings were not equal!");
+/// }
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html
+/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html
+/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html
+/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html
+/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html
+/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html
+/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html
+/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html
+/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html
+/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html
+/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html
+/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html
+/// [`_mm_cmpestri`]: fn._mm_cmpestri.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistri)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistri128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8`, and return `1` if any character in `b` was null,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrz)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistriz128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8`, and return `1` if the resulting mask was non-zero,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrc)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistric128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8`, and return `1` if any character in `a` was null,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrs)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistris128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8`, and return bit `0` of the resulting bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistro)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistrio128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings with implicit lengths in `a` and `b` using the
+/// control in `imm8`, and return `1` if `b` did not contain a null
+/// character and the resulting mask was zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistra)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpistria128(a, b, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return the generated mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrm(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> __m128i {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestrm128(a, la, b, lb, $imm8)
+        };
+    }
+    mem::transmute(constify_imm8!(imm8, call))
+}
+
+/// Compare packed strings `a` and `b` with lengths `la` and `lb` using the
+/// control in `imm8` and return the generated index. Similar to
+/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly
+/// determines the length of `a` and `b`.
+///
+/// # Control modes
+///
+/// The control specified by `imm8` may be one or more of the following.
+///
+/// ## Data size and signedness
+///
+///  - [`_SIDD_UBYTE_OPS`] - Default
+///  - [`_SIDD_UWORD_OPS`]
+///  - [`_SIDD_SBYTE_OPS`]
+///  - [`_SIDD_SWORD_OPS`]
+///
+/// ## Comparison options
+///  - [`_SIDD_CMP_EQUAL_ANY`] - Default
+///  - [`_SIDD_CMP_RANGES`]
+///  - [`_SIDD_CMP_EQUAL_EACH`]
+///  - [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ## Result polarity
+///  - [`_SIDD_POSITIVE_POLARITY`] - Default
+///  - [`_SIDD_NEGATIVE_POLARITY`]
+///
+/// ## Bit returned
+///  - [`_SIDD_LEAST_SIGNIFICANT`] - Default
+///  - [`_SIDD_MOST_SIGNIFICANT`]
+///
+/// # Examples
+///
+/// ```
+/// # #![feature(stdsimd)]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate std_detect as std;
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.2") {
+/// #         #[target_feature(enable = "sse4.2")]
+/// #         unsafe fn worker() {
+///
+/// // The string we want to find a substring in
+/// let haystack = b"Split \r\n\t line  ";
+///
+/// // The string we want to search for with some
+/// // extra bytes we do not want to search for.
+/// let needle = b"\r\n\t ignore this ";
+///
+/// let a = _mm_loadu_si128(needle.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(haystack.as_ptr() as *const _);
+///
+/// // Note: We explicitly specify we only want to search `b` for the
+/// // first 3 characters of a.
+/// let idx = _mm_cmpestri(a, 3, b, 15, _SIDD_CMP_EQUAL_ORDERED);
+///
+/// assert_eq!(idx, 6);
+/// #         }
+/// #         unsafe { worker(); }
+/// #     }
+/// # }
+/// ```
+///
+/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html
+/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html
+/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html
+/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html
+/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html
+/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html
+/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html
+/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html
+/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html
+/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html
+/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html
+/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html
+/// [`_mm_cmpistri`]: fn._mm_cmpistri.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestri128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return `1` if any character in
+/// `b` was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrz)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrz(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestriz128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return `1` if the resulting mask
+/// was non-zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrc)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrc(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestric128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return `1` if any character in
+/// `a` was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrs)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrs(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestris128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return bit `0` of the resulting
+/// bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestro)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestro(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestrio128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Compare packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `imm8`, and return `1` if `b` did not
+/// contain a null character and the resulting mask was zero, and `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestra)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
+#[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestra(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32) -> i32 {
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+    macro_rules! call {
+        ($imm8:expr) => {
+            pcmpestria128(a, la, b, lb, $imm8)
+        };
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Starting with the initial value in `crc`, return the accumulated
+/// CRC32 value for unsigned 8-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u8)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 {
+    crc32_32_8(crc, v)
+}
+
+/// Starting with the initial value in `crc`, return the accumulated
+/// CRC32 value for unsigned 16-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u16)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 {
+    crc32_32_16(crc, v)
+}
+
+/// Starting with the initial value in `crc`, return the accumulated
+/// CRC32 value for unsigned 32-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u32)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 {
+    crc32_32_32(crc, v)
+}
+
+/// Compare packed 64-bit integers in `a` and `b` for greater-than,
+/// and return the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    // SSE 4.2 string and text comparison ops
+    #[link_name = "llvm.x86.sse42.pcmpestrm128"]
+    fn pcmpestrm128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> u8x16;
+    #[link_name = "llvm.x86.sse42.pcmpestri128"]
+    fn pcmpestri128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpestriz128"]
+    fn pcmpestriz128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpestric128"]
+    fn pcmpestric128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpestris128"]
+    fn pcmpestris128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpestrio128"]
+    fn pcmpestrio128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpestria128"]
+    fn pcmpestria128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistrm128"]
+    fn pcmpistrm128(a: i8x16, b: i8x16, imm8: i8) -> i8x16;
+    #[link_name = "llvm.x86.sse42.pcmpistri128"]
+    fn pcmpistri128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistriz128"]
+    fn pcmpistriz128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistric128"]
+    fn pcmpistric128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistris128"]
+    fn pcmpistris128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistrio128"]
+    fn pcmpistrio128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    #[link_name = "llvm.x86.sse42.pcmpistria128"]
+    fn pcmpistria128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+    // SSE 4.2 CRC instructions
+    #[link_name = "llvm.x86.sse42.crc32.32.8"]
+    fn crc32_32_8(crc: u32, v: u8) -> u32;
+    #[link_name = "llvm.x86.sse42.crc32.32.16"]
+    fn crc32_32_16(crc: u32, v: u16) -> u32;
+    #[link_name = "llvm.x86.sse42.crc32.32.32"]
+    fn crc32_32_32(crc: u32, v: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+    use std::ptr;
+
+    // Currently one cannot `load` a &[u8] that is less than 16
+    // in length. This makes loading strings less than 16 in length
+    // a bit difficult. Rather than `load` and mutate the __m128i,
+    // it is easier to memcpy the given string to a local slice with
+    // length 16 and `load` the local slice.
+    #[target_feature(enable = "sse4.2")]
+    unsafe fn str_to_m128i(s: &[u8]) -> __m128i {
+        assert!(s.len() <= 16);
+        let slice = &mut [0u8; 16];
+        ptr::copy_nonoverlapping(
+            s.get_unchecked(0) as *const u8 as *const u8,
+            slice.get_unchecked_mut(0) as *mut u8 as *mut u8,
+            s.len(),
+        );
+        _mm_loadu_si128(slice.as_ptr() as *const _)
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistrm() {
+        let a = str_to_m128i(b"Hello! Good-Bye!");
+        let b = str_to_m128i(b"hello! good-bye!");
+        let i = _mm_cmpistrm(a, b, _SIDD_UNIT_MASK);
+        #[rustfmt::skip]
+        let res = _mm_setr_epi8(
+            0x00, !0, !0, !0, !0, !0, !0, 0x00,
+            !0, !0, !0, !0, 0x00, !0, !0, !0,
+        );
+        assert_eq_m128i(i, res);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistri() {
+        let a = str_to_m128i(b"Hello");
+        let b = str_to_m128i(b"   Hello        ");
+        let i = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED);
+        assert_eq!(3, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistrz() {
+        let a = str_to_m128i(b"");
+        let b = str_to_m128i(b"Hello");
+        let i = _mm_cmpistrz(a, b, _SIDD_CMP_EQUAL_ORDERED);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistrc() {
+        let a = str_to_m128i(b"                ");
+        let b = str_to_m128i(b"       !        ");
+        let i = _mm_cmpistrc(a, b, _SIDD_UNIT_MASK);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistrs() {
+        let a = str_to_m128i(b"Hello");
+        let b = str_to_m128i(b"");
+        let i = _mm_cmpistrs(a, b, _SIDD_CMP_EQUAL_ORDERED);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistro() {
+        #[rustfmt::skip]
+        let a_bytes = _mm_setr_epi8(
+            0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+            0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        );
+        #[rustfmt::skip]
+        let b_bytes = _mm_setr_epi8(
+            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+            0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        );
+        let a = a_bytes;
+        let b = b_bytes;
+        let i = _mm_cmpistro(a, b, _SIDD_UWORD_OPS | _SIDD_UNIT_MASK);
+        assert_eq!(0, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpistra() {
+        let a = str_to_m128i(b"");
+        let b = str_to_m128i(b"Hello!!!!!!!!!!!");
+        let i = _mm_cmpistra(a, b, _SIDD_UNIT_MASK);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestrm() {
+        let a = str_to_m128i(b"Hello!");
+        let b = str_to_m128i(b"Hello.");
+        let i = _mm_cmpestrm(a, 5, b, 5, _SIDD_UNIT_MASK);
+        #[rustfmt::skip]
+        let r = _mm_setr_epi8(
+            !0, !0, !0, !0, !0, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+        );
+        assert_eq_m128i(i, r);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestri() {
+        let a = str_to_m128i(b"bar - garbage");
+        let b = str_to_m128i(b"foobar");
+        let i = _mm_cmpestri(a, 3, b, 6, _SIDD_CMP_EQUAL_ORDERED);
+        assert_eq!(3, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestrz() {
+        let a = str_to_m128i(b"");
+        let b = str_to_m128i(b"Hello");
+        let i = _mm_cmpestrz(a, 16, b, 6, _SIDD_CMP_EQUAL_ORDERED);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestrc() {
+        let va = str_to_m128i(b"!!!!!!!!");
+        let vb = str_to_m128i(b"        ");
+        let i = _mm_cmpestrc(va, 7, vb, 7, _SIDD_UNIT_MASK);
+        assert_eq!(0, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestrs() {
+        #[rustfmt::skip]
+        let a_bytes = _mm_setr_epi8(
+            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+            0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        );
+        let a = a_bytes;
+        let b = _mm_set1_epi8(0x00);
+        let i = _mm_cmpestrs(a, 8, b, 0, _SIDD_UWORD_OPS);
+        assert_eq!(0, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestro() {
+        let a = str_to_m128i(b"Hello");
+        let b = str_to_m128i(b"World");
+        let i = _mm_cmpestro(a, 5, b, 5, _SIDD_UBYTE_OPS);
+        assert_eq!(0, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpestra() {
+        let a = str_to_m128i(b"Cannot match a");
+        let b = str_to_m128i(b"Null after 14");
+        let i = _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK);
+        assert_eq!(1, i);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_crc32_u8() {
+        let crc = 0x2aa1e72b;
+        let v = 0x2a;
+        let i = _mm_crc32_u8(crc, v);
+        assert_eq!(i, 0xf24122e4);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_crc32_u16() {
+        let crc = 0x8ecec3b5;
+        let v = 0x22b;
+        let i = _mm_crc32_u16(crc, v);
+        assert_eq!(i, 0x13bb2fb);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_crc32_u32() {
+        let crc = 0xae2912c8;
+        let v = 0x845fed;
+        let i = _mm_crc32_u32(crc, v);
+        assert_eq!(i, 0xffae2ed1);
+    }
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_cmpgt_epi64() {
+        let a = _mm_setr_epi64x(0, 0x2a);
+        let b = _mm_set1_epi64x(0x00);
+        let i = _mm_cmpgt_epi64(a, b);
+        assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64));
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse4a.rs b/library/stdarch/crates/core_arch/src/x86/sse4a.rs
new file mode 100644
index 00000000000..ac172f9b040
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse4a.rs
@@ -0,0 +1,159 @@
+//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
+
+use core_arch::simd::*;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse4a.extrq"]
+    fn extrq(x: i64x2, y: i8x16) -> i64x2;
+    #[link_name = "llvm.x86.sse4a.insertq"]
+    fn insertq(x: i64x2, y: i64x2) -> i64x2;
+    #[link_name = "llvm.x86.sse4a.movnt.sd"]
+    fn movntsd(x: *mut f64, y: __m128d);
+    #[link_name = "llvm.x86.sse4a.movnt.ss"]
+    fn movntss(x: *mut f32, y: __m128);
+}
+
+// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
+// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ
+
+/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
+///
+/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
+/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
+/// other bits are ignored.
+///
+/// If the length is zero, it is interpreted as `64`. If the length and index
+/// are zero, the lower 64 bits of `x` are extracted.
+///
+/// If `length == 0 && index > 0` or `length + index > 64` the result is
+/// undefined.
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(extrq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
+    mem::transmute(extrq(x.as_i64x2(), y.as_i8x16()))
+}
+
+/// Inserts the `[length:0]` bits of `y` into `x` at `index`.
+///
+/// The bits of `y`:
+///
+/// - `[69:64]` specify the `length`,
+/// - `[77:72]` specify the `index`.
+///
+/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
+/// or `index > 0 && length == 0` the result is undefined.
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(insertq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
+    mem::transmute(insertq(x.as_i64x2(), y.as_i64x2()))
+}
+
+/// Non-temporal store of `a.0` into `p`.
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(movntsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
+    movntsd(p, a);
+}
+
+/// Non-temporal store of `a.0` into `p`.
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(movntss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
+    movntss(p, a);
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "sse4a")]
+    unsafe fn test_mm_extract_si64() {
+        let b = 0b0110_0000_0000_i64;
+        //        ^^^^ bit range extracted
+        let x = _mm_setr_epi64x(b, 0);
+        let v = 0b001000___00___000100_i64;
+        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
+        let y = _mm_setr_epi64x(v, 0);
+        let e = _mm_setr_epi64x(0b0110_i64, 0);
+        let r = _mm_extract_si64(x, y);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sse4a")]
+    unsafe fn test_mm_insert_si64() {
+        let i = 0b0110_i64;
+        //        ^^^^ bit range inserted
+        let z = 0b1010_1010_1010i64;
+        //        ^^^^ bit range replaced
+        let e = 0b0110_1010_1010i64;
+        //        ^^^^ replaced 1010 with 0110
+        let x = _mm_setr_epi64x(z, 0);
+        let expected = _mm_setr_epi64x(e, 0);
+        let v = 0b001000___00___000100_i64;
+        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
+        let y = _mm_setr_epi64x(i, v);
+        let r = _mm_insert_si64(x, y);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[repr(align(16))]
+    struct MemoryF64 {
+        data: [f64; 2],
+    }
+
+    #[simd_test(enable = "sse4a")]
+    unsafe fn test_mm_stream_sd() {
+        let mut mem = MemoryF64 {
+            data: [1.0_f64, 2.0],
+        };
+        {
+            let vals = &mut mem.data;
+            let d = vals.as_mut_ptr();
+
+            let x = _mm_setr_pd(3.0, 4.0);
+
+            _mm_stream_sd(d, x);
+        }
+        assert_eq!(mem.data[0], 3.0);
+        assert_eq!(mem.data[1], 2.0);
+    }
+
+    #[repr(align(16))]
+    struct MemoryF32 {
+        data: [f32; 4],
+    }
+
+    #[simd_test(enable = "sse4a")]
+    unsafe fn test_mm_stream_ss() {
+        let mut mem = MemoryF32 {
+            data: [1.0_f32, 2.0, 3.0, 4.0],
+        };
+        {
+            let vals = &mut mem.data;
+            let d = vals.as_mut_ptr();
+
+            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+            _mm_stream_ss(d, x);
+        }
+        assert_eq!(mem.data[0], 5.0);
+        assert_eq!(mem.data[1], 2.0);
+        assert_eq!(mem.data[2], 3.0);
+        assert_eq!(mem.data[3], 4.0);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/ssse3.rs b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
new file mode 100644
index 00000000000..a013ab6551a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
@@ -0,0 +1,898 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+
+use core_arch::simd::*;
+use core_arch::simd_llvm::simd_shuffle16;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Compute the absolute value of packed 8-bit signed integers in `a` and
+/// return the unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
+    mem::transmute(pabsb128(a.as_i8x16()))
+}
+
+/// Compute the absolute value of each of the packed 16-bit signed integers in
+/// `a` and return the results as packed 16-bit unsigned integers in a
+/// 128-bit vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
+    mem::transmute(pabsw128(a.as_i16x8()))
+}
+
+/// Compute the absolute value of each of the packed 32-bit signed integers in
+/// `a` and return the results as packed 32-bit unsigned integers in a
+/// 128-bit vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
+    mem::transmute(pabsd128(a.as_i32x4()))
+}
+
+/// Shuffle bytes from `a` according to the content of `b`.
+///
+/// The lowest 4 bits of each byte of `b` are used as an index
+/// into the 16 bytes of `a`.
+///
+/// In addition, if the most significant bit of a byte of `b`
+/// is set, the respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
+/// logically equivalent to:
+///
+/// ```
+/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // if the most significant bit of b is set,
+///         // then the destination byte is set to 0.
+///         if b[i] & 0x80 == 0u8 {
+///             r[i] = a[(b[i] % 16) as usize];
+///         }
+///     }
+///     r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Concatenate `a` (high 16 bytes) and `b` (low 16 bytes) into a 32-byte
+/// temporary, shift it right by `n` bytes, and return the low 16 bytes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(palignr, n = 15))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i {
+    let n = n as u32;
+    // Shifting the 32-byte pair by more than 32 bytes (two full vectors)
+    // shifts every input byte out, so the result is all zeros.
+    if n > 32 {
+        return _mm_set1_epi8(0);
+    }
+    // For shifts of 17..=32 bytes `b` is shifted out entirely: treat `a` as
+    // the low half, pair it with zeros, and shift by the remaining `n - 16`.
+    let (a, b, n) = if n > 16 {
+        (_mm_set1_epi8(0), a, n - 16)
+    } else {
+        (a, b, n)
+    };
+    let a = a.as_i8x16();
+    let b = b.as_i8x16();
+
+    macro_rules! shuffle {
+        ($shift:expr) => {
+            simd_shuffle16(
+                b,
+                a,
+                [
+                    0 + $shift,
+                    1 + $shift,
+                    2 + $shift,
+                    3 + $shift,
+                    4 + $shift,
+                    5 + $shift,
+                    6 + $shift,
+                    7 + $shift,
+                    8 + $shift,
+                    9 + $shift,
+                    10 + $shift,
+                    11 + $shift,
+                    12 + $shift,
+                    13 + $shift,
+                    14 + $shift,
+                    15 + $shift,
+                ],
+            )
+        };
+    }
+    let r: i8x16 = match n {
+        0 => shuffle!(0),
+        1 => shuffle!(1),
+        2 => shuffle!(2),
+        3 => shuffle!(3),
+        4 => shuffle!(4),
+        5 => shuffle!(5),
+        6 => shuffle!(6),
+        7 => shuffle!(7),
+        8 => shuffle!(8),
+        9 => shuffle!(9),
+        10 => shuffle!(10),
+        11 => shuffle!(11),
+        12 => shuffle!(12),
+        13 => shuffle!(13),
+        14 => shuffle!(14),
+        15 => shuffle!(15),
+        _ => shuffle!(16),
+    };
+    mem::transmute(r)
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phaddw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are
+/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phaddsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phaddd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Horizontally subtract the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phsubw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtract the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
+/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+/// saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phsubsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtract the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(phsubd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, add pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16()))
+}
+
+/// Multiply packed 16-bit signed integer values, truncate the 32-bit
+/// product to the 18 most significant bits by right-shifting, round the
+/// truncated value by adding 1, and write bits `[16:1]` to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and return the result.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(psignb128(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and return the results.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(psignw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and return the results.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compute the absolute value of packed 8-bit integers in `a` and
+/// return the unsigned results.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pabsb))]
+pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 {
+    pabsb(a)
+}
+
+/// Compute the absolute value of packed 16-bit integers in `a`, and return the
+/// unsigned results.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pabsw))]
+pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
+    pabsw(a)
+}
+
+/// Compute the absolute value of packed 32-bit integers in `a`, and return the
+/// unsigned results.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pabsd))]
+pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 {
+    pabsd(a)
+}
+
+/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in
+/// the corresponding 8-bit element of `b`, and return the results.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pshufb))]
+pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 {
+    pshufb(a, b)
+}
+
+/// Concatenates the two 64-bit integer vector operands, and right-shifts
+/// the result by the number of bytes specified in the immediate operand.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(palignr, n = 15))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            palignrb(a, b, $imm8)
+        };
+    }
+    constify_imm8!(n, call)
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of `[4 x i16]`.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phaddw))]
+pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
+    phaddw(a, b)
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of `[2 x i32]`.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phaddd))]
+pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 {
+    phaddd(a, b)
+}
+
+/// Horizontally add the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of `[4 x i16]`. Positive sums greater than 7FFFh are
+/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phaddsw))]
+pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 {
+    phaddsw(a, b)
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of `[4 x i16]`.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phsubw))]
+pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
+    phsubw(a, b)
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of `[2 x i32]`.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phsubd))]
+pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 {
+    phsubd(a, b)
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of `[4 x i16]`. Positive differences greater than
+/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+/// saturated to 8000h.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(phsubsw))]
+pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 {
+    phsubsw(a, b)
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pmaddubsw))]
+pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 {
+    pmaddubsw(a, b)
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// products to the 18 most significant bits by right-shifting, rounds the
+/// truncated value by adding 1, and writes bits `[16:1]` to the destination.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(pmulhrsw))]
+pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 {
+    pmulhrsw(a, b)
+}
+
+/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and return the results.
+/// Elements in result are zeroed out when the corresponding element in `b` is
+/// zero.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(psignb))]
+pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 {
+    psignb(a, b)
+}
+
+/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and return the results.
+/// Elements in result are zeroed out when the corresponding element in `b` is
+/// zero.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(psignw))]
+pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
+    psignw(a, b)
+}
+
+/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and return the results.
+/// Elements in result are zeroed out when the corresponding element in `b` is
+/// zero.
+#[inline]
+#[target_feature(enable = "ssse3,mmx")]
+#[cfg_attr(test, assert_instr(psignd))]
+pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 {
+    psignd(a, b)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.ssse3.pabs.b.128"]
+    fn pabsb128(a: i8x16) -> u8x16;
+
+    #[link_name = "llvm.x86.ssse3.pabs.w.128"]
+    fn pabsw128(a: i16x8) -> u16x8;
+
+    #[link_name = "llvm.x86.ssse3.pabs.d.128"]
+    fn pabsd128(a: i32x4) -> u32x4;
+
+    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
+    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
+
+    #[link_name = "llvm.x86.ssse3.phadd.w.128"]
+    fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
+    fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phadd.d.128"]
+    fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.ssse3.phsub.w.128"]
+    fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
+    fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phsub.d.128"]
+    fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
+    fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"]
+    fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.psign.b.128"]
+    fn psignb128(a: i8x16, b: i8x16) -> i8x16;
+
+    #[link_name = "llvm.x86.ssse3.psign.w.128"]
+    fn psignw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.psign.d.128"]
+    fn psignd128(a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.ssse3.pabs.b"]
+    fn pabsb(a: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.pabs.w"]
+    fn pabsw(a: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.pabs.d"]
+    fn pabsd(a: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.pshuf.b"]
+    fn pshufb(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.mmx.palignr.b"]
+    fn palignrb(a: __m64, b: __m64, n: u8) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phadd.w"]
+    fn phaddw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phadd.d"]
+    fn phaddd(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phadd.sw"]
+    fn phaddsw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phsub.w"]
+    fn phsubw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phsub.d"]
+    fn phsubd(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.phsub.sw"]
+    fn phsubsw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw"]
+    fn pmaddubsw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.pmul.hr.sw"]
+    fn pmulhrsw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.psign.b"]
+    fn psignb(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.psign.w"]
+    fn psignw(a: __m64, b: __m64) -> __m64;
+
+    #[link_name = "llvm.x86.ssse3.psign.d"]
+    fn psignd(a: __m64, b: __m64) -> __m64;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi8() {
+        let r = _mm_abs_epi8(_mm_set1_epi8(-5));
+        assert_eq_m128i(r, _mm_set1_epi8(5));
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi16() {
+        let r = _mm_abs_epi16(_mm_set1_epi16(-5));
+        assert_eq_m128i(r, _mm_set1_epi16(5));
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi32() {
+        let r = _mm_abs_epi32(_mm_set1_epi32(-5));
+        assert_eq_m128i(r, _mm_set1_epi32(5));
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_shuffle_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 128_u8 as i8, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
+        let r = _mm_shuffle_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_alignr_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let r = _mm_alignr_epi8(a, b, 33);
+        assert_eq_m128i(r, _mm_set1_epi8(0));
+
+        let r = _mm_alignr_epi8(a, b, 17);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 0,
+        );
+        assert_eq_m128i(r, expected);
+
+        let r = _mm_alignr_epi8(a, b, 16);
+        assert_eq_m128i(r, a);
+
+        let r = _mm_alignr_epi8(a, b, 15);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        assert_eq_m128i(r, expected);
+
+        let r = _mm_alignr_epi8(a, b, 0);
+        assert_eq_m128i(r, b);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
+        let r = _mm_hadd_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadds_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
+        let r = _mm_hadds_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(3, 7, 132, 7);
+        let r = _mm_hadd_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
+        let r = _mm_hsub_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsubs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
+        let r = _mm_hsubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(-1, -1, -124, 1);
+        let r = _mm_hsub_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_maddubs_epi16() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
+        let r = _mm_maddubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_mulhrs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
+        let r = _mm_mulhrs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, -14, -15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, -4, 3, 24, 12, -6, -19,
+            12, 5, -5, 10, 4, 1, -8, 0,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            1, 2, -3, 4, 5, 6, -7, -8,
+            9, 10, -11, 12, 13, -14, 15, 0,
+        );
+        let r = _mm_sign_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
+        let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
+        let r = _mm_sign_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi32() {
+        let a = _mm_setr_epi32(-1, 2, 3, 4);
+        let b = _mm_setr_epi32(1, -1, 1, 0);
+        let expected = _mm_setr_epi32(-1, -2, 3, 0);
+        let r = _mm_sign_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_abs_pi8() {
+        let r = _mm_abs_pi8(_mm_set1_pi8(-5));
+        assert_eq_m64(r, _mm_set1_pi8(5));
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_abs_pi16() {
+        let r = _mm_abs_pi16(_mm_set1_pi16(-5));
+        assert_eq_m64(r, _mm_set1_pi16(5));
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_abs_pi32() {
+        let r = _mm_abs_pi32(_mm_set1_pi32(-5));
+        assert_eq_m64(r, _mm_set1_pi32(5));
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_shuffle_pi8() {
+        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_pi8(4, 128u8 as i8, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_pi8(5, 0, 5, 4, 1, 5, 7, 4);
+        let r = _mm_shuffle_pi8(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_alignr_pi8() {
+        let a = _mm_setr_pi32(0x89ABCDEF_u32 as i32, 0x01234567_u32 as i32);
+        let b = _mm_setr_pi32(0xBBAA9988_u32 as i32, 0xFFDDEECC_u32 as i32);
+        let r = _mm_alignr_pi8(a, b, 4);
+        assert_eq_m64(r, ::std::mem::transmute(0x89abcdefffddeecc_u64));
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hadd_pi16() {
+        let a = _mm_setr_pi16(1, 2, 3, 4);
+        let b = _mm_setr_pi16(4, 128, 4, 3);
+        let expected = _mm_setr_pi16(3, 7, 132, 7);
+        let r = _mm_hadd_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hadd_pi32() {
+        let a = _mm_setr_pi32(1, 2);
+        let b = _mm_setr_pi32(4, 128);
+        let expected = _mm_setr_pi32(3, 132);
+        let r = _mm_hadd_pi32(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hadds_pi16() {
+        let a = _mm_setr_pi16(1, 2, 3, 4);
+        let b = _mm_setr_pi16(32767, 1, -32768, -1);
+        let expected = _mm_setr_pi16(3, 7, 32767, -32768);
+        let r = _mm_hadds_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hsub_pi16() {
+        let a = _mm_setr_pi16(1, 2, 3, 4);
+        let b = _mm_setr_pi16(4, 128, 4, 3);
+        let expected = _mm_setr_pi16(-1, -1, -124, 1);
+        let r = _mm_hsub_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hsub_pi32() {
+        let a = _mm_setr_pi32(1, 2);
+        let b = _mm_setr_pi32(4, 128);
+        let expected = _mm_setr_pi32(-1, -124);
+        let r = _mm_hsub_pi32(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_hsubs_pi16() {
+        let a = _mm_setr_pi16(1, 2, 3, 4);
+        let b = _mm_setr_pi16(32767, -1, -32768, 1); // both pairs overflow i16
+        let expected = _mm_setr_pi16(-1, -1, 32767, -32768); // saturated, not wrapped
+        let r = _mm_hsubs_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_maddubs_pi16() {
+        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_pi8(4, 63, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_pi16(130, 24, 192, 194);
+        let r = _mm_maddubs_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_mulhrs_pi16() {
+        let a = _mm_setr_pi16(1, 2, 3, 4);
+        let b = _mm_setr_pi16(4, 32767, -1, -32768);
+        let expected = _mm_setr_pi16(0, 2, 0, -4);
+        let r = _mm_mulhrs_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_sign_pi8() {
+        let a = _mm_setr_pi8(1, 2, 3, 4, -5, -6, 7, 8);
+        let b = _mm_setr_pi8(4, 64, 0, 3, 1, -1, -2, 1);
+        let expected = _mm_setr_pi8(1, 2, 0, 4, -5, 6, -7, 8);
+        let r = _mm_sign_pi8(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_sign_pi16() {
+        let a = _mm_setr_pi16(-1, 2, 3, 4);
+        let b = _mm_setr_pi16(1, -1, 1, 0);
+        let expected = _mm_setr_pi16(-1, -2, 3, 0);
+        let r = _mm_sign_pi16(a, b);
+        assert_eq_m64(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3,mmx")]
+    unsafe fn test_mm_sign_pi32() {
+        let a = _mm_setr_pi32(-1, 2);
+        let b = _mm_setr_pi32(1, 0);
+        let expected = _mm_setr_pi32(-1, 0);
+        let r = _mm_sign_pi32(a, b);
+        assert_eq_m64(r, expected);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/tbm.rs b/library/stdarch/crates/core_arch/src/x86/tbm.rs
new file mode 100644
index 00000000000..314c5e36c7f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/tbm.rs
@@ -0,0 +1,460 @@
+//! Trailing Bit Manipulation (TBM) instruction set.
+//!
+//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
+//! General-Purpose and System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_tbm] provides a quick overview of the available
+//! instructions.
+//!
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_tbm]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+// FIXME(blocked on #248)
+// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
+// intrinsic %llvm.x86.tbm.bextri.u32
+/*
+#[allow(dead_code)]
+extern "C" {
+    #[link_name="llvm.x86.tbm.bextri.u32"]
+    fn x86_tbm_bextri_u32(a: u32, y: u32) -> u32;
+    #[link_name="llvm.x86.tbm.bextri.u64"]
+    fn x86_tbm_bextri_u64(x: u64, y: u64) -> u64;
+}
+
+/// Extracts bits in range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+    _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
+}
+
+/// Extracts bits in range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
+    _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
+/// be extracted, and bits `[15,8]` specify the length of the range.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
+    unsafe { x86_tbm_bextri_u32(a, control) }
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
+/// be extracted, and bits `[15,8]` specify the length of the range.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
+    unsafe { x86_tbm_bextri_u64(a, control) }
+}
+*/
+
+/// Clears all bits below the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcfill_u32(x: u32) -> u32 {
+    x & (x.wrapping_add(1))
+}
+
+/// Clears all bits below the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcfill))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcfill_u64(x: u64) -> u64 {
+    x & (x.wrapping_add(1))
+}
+
+/// Sets all bits of `x` to 1 except for the least significant zero bit.
+///
+/// If there is no zero bit in `x`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blci))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blci_u32(x: u32) -> u32 {
+    x | !(x.wrapping_add(1))
+}
+
+/// Sets all bits of `x` to 1 except for the least significant zero bit.
+///
+/// If there is no zero bit in `x`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blci))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blci_u64(x: u64) -> u64 {
+    x | !(x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all other bits.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcic_u32(x: u32) -> u32 {
+    !x & (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all other bits.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcic))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcic_u64(x: u64) -> u64 {
+    !x & (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all bits above
+/// that bit.
+///
+/// If there is no zero bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
+    x ^ (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all bits above
+/// that bit.
+///
+/// If there is no zero bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
+    x ^ (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns `x`.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcs))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcs_u32(x: u32) -> u32 {
+    x | (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns `x`.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcs))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcs_u64(x: u64) -> u64 {
+    x | x.wrapping_add(1)
+}
+
+/// Sets all bits of `x` below the least significant one.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsfill_u32(x: u32) -> u32 {
+    x | (x.wrapping_sub(1))
+}
+
+/// Sets all bits of `x` below the least significant one.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsfill))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsfill_u64(x: u64) -> u64 {
+    x | (x.wrapping_sub(1))
+}
+
+/// Clears the least significant set bit of `x` and sets all other bits.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsic_u32(x: u32) -> u32 {
+    !x | (x.wrapping_sub(1))
+}
+
+/// Clears the least significant set bit of `x` and sets all other bits.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsic))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsic_u64(x: u64) -> u64 {
+    !x | (x.wrapping_sub(1))
+}
+
+/// Clears all bits below the least significant zero of `x` and sets all other
+/// bits.
+///
+/// If the least significant bit of `x` is 0, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(t1mskc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
+    !x | (x.wrapping_add(1))
+}
+
+/// Clears all bits below the least significant zero of `x` and sets all other
+/// bits.
+///
+/// If the least significant bit of `x` is 0, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(t1mskc))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
+    !x | (x.wrapping_add(1))
+}
+
+/// Sets all bits below the least significant one of `x` and clears all other
+/// bits.
+///
+/// If the least significant bit of `x` is 1, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(tzmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
+    !x & (x.wrapping_sub(1))
+}
+
+/// Sets all bits below the least significant one of `x` and clears all other
+/// bits.
+///
+/// If the least significant bit of `x` is 1, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(tzmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
+    !x & (x.wrapping_sub(1))
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    /*
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_bextr_u32() {
+        assert_eq!(_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_bextr_u64() {
+        assert_eq!(_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
+    }
+    */
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blcfill_u32() {
+        assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
+        assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    unsafe fn test_blcfill_u64() {
+        assert_eq!(_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
+        assert_eq!(_blcfill_u64(0b1111_1111u64), 0u64);
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blci_u32() {
+        assert_eq!(
+            _blci_u32(0b0101_0000u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1110u32
+        );
+        assert_eq!(
+            _blci_u32(0b1111_1111u32),
+            0b1111_1111_1111_1111_1111_1110_1111_1111u32
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    #[rustfmt::skip]
+    unsafe fn test_blci_u64() {
+        assert_eq!(
+            _blci_u64(0b0101_0000u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64
+        );
+        assert_eq!(
+            _blci_u64(0b1111_1111u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blcic_u32() {
+        assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
+        assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    unsafe fn test_blcic_u64() {
+        assert_eq!(_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
+        assert_eq!(_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blcmsk_u32() {
+        assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
+        assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    unsafe fn test_blcmsk_u64() {
+        assert_eq!(_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
+        assert_eq!(_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blcs_u32() {
+        assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
+        assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    unsafe fn test_blcs_u64() {
+        assert_eq!(_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
+        assert_eq!(_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blsfill_u32() {
+        assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
+        assert_eq!(
+            _blsfill_u32(0u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    #[rustfmt::skip]
+    unsafe fn test_blsfill_u64() {
+        assert_eq!(_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
+        assert_eq!(
+            _blsfill_u64(0u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_blsic_u32() {
+        assert_eq!(
+            _blsic_u32(0b0101_0100u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1011u32
+        );
+        assert_eq!(
+            _blsic_u32(0u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    #[rustfmt::skip]
+    unsafe fn test_blsic_u64() {
+        assert_eq!(
+            _blsic_u64(0b0101_0100u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64
+        );
+        assert_eq!(
+            _blsic_u64(0u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_t1mskc_u32() {
+        assert_eq!(
+            _t1mskc_u32(0b0101_0111u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1000u32
+        );
+        assert_eq!(
+            _t1mskc_u32(0u32),
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32
+        );
+    }
+
+    // Checks `_t1mskc_u64` on a value with trailing ones and on zero.
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    #[rustfmt::skip]
+    unsafe fn test_t1mskc_u64() {
+        assert_eq!(
+            _t1mskc_u64(0b0101_0111u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64
+        );
+        assert_eq!(
+            _t1mskc_u64(0u64),
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+        );
+    }
+
+    #[simd_test(enable = "tbm")]
+    unsafe fn test_tzmsk_u32() {
+        assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
+        assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
+    }
+
+    #[simd_test(enable = "tbm")]
+    #[cfg(not(target_arch = "x86"))]
+    unsafe fn test_tzmsk_u64() {
+        assert_eq!(_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
+        assert_eq!(_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs
new file mode 100644
index 00000000000..9dbe8c11ef6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/test.rs
@@ -0,0 +1,145 @@
+//! Utilities used in testing the x86 intrinsics
+
+use core_arch::x86::*;
+
+/// Asserts that two `__m64` values hold the same bits.
+///
+/// Each value is reinterpreted as a `u64` through a local union and the two
+/// integers are compared with `assert_eq!`.
+#[target_feature(enable = "mmx")]
+pub unsafe fn assert_eq_m64(a: __m64, b: __m64) {
+    union A {
+        a: __m64,
+        b: u64,
+    }
+    assert_eq!(A { a }.b, A { a: b }.b)
+}
+
+/// Asserts that two `__m128i` values hold the same bits.
+///
+/// Each value is reinterpreted as a `[u64; 2]` through a local union and the
+/// two arrays are compared with `assert_eq!`.
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    union A {
+        a: __m128i,
+        b: [u64; 2],
+    }
+    assert_eq!(A { a }.b, A { a: b }.b)
+}
+
+/// Panics unless `a` and `b` compare equal in both lanes.
+///
+/// The comparison is done with `_mm_cmpeq_pd`; the resulting mask from
+/// `_mm_movemask_pd` must be `0b11`, otherwise both vectors are printed in
+/// the panic message.
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+/// Returns element `idx` of `a`, viewing the vector as a `[f64; 2]`.
+///
+/// The array indexing panics if `idx` is out of bounds.
+#[target_feature(enable = "sse2")]
+pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
+    union A {
+        a: __m128d,
+        b: [f64; 2],
+    };
+    A { a }.b[idx]
+}
+
+/// Panics unless `a` and `b` compare equal in all four lanes.
+///
+/// The comparison is done with `_mm_cmpeq_ps`; the resulting mask from
+/// `_mm_movemask_ps` must be `0b1111`, otherwise both vectors are printed in
+/// the panic message.
+#[target_feature(enable = "sse")]
+pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+/// Returns element `idx` of `a`, viewing the vector as a `[f32; 4]`.
+///
+/// The array indexing panics if `idx` is out of bounds.
+#[target_feature(enable = "sse")]
+pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
+    union A {
+        a: __m128,
+        b: [f32; 4],
+    };
+    A { a }.b[idx]
+}
+
+// not actually an intrinsic but useful in various tests as we ported from
+// `i64x2::new` which is backwards from `_mm_set_epi64x`
+/// Test helper that forwards to `_mm_set_epi64x` with its two arguments
+/// swapped.
+#[target_feature(enable = "sse2")]
+pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+    _mm_set_epi64x(b, a)
+}
+
+/// Asserts that two `__m256i` values hold the same bits.
+///
+/// Each value is reinterpreted as a `[u64; 4]` through a local union and the
+/// two arrays are compared with `assert_eq!`.
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    union A {
+        a: __m256i,
+        b: [u64; 4],
+    }
+    assert_eq!(A { a }.b, A { a: b }.b)
+}
+
+/// Panics unless `a` and `b` compare equal in all four lanes.
+///
+/// The comparison is done with `_mm256_cmp_pd` using `_CMP_EQ_OQ`; the
+/// resulting mask from `_mm256_movemask_pd` must be `0b1111`, otherwise both
+/// vectors are printed in the panic message.
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+    let cmp = _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
+    if _mm256_movemask_pd(cmp) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+/// Returns element `idx` of `a`, viewing the vector as a `[f64; 4]`.
+///
+/// The array indexing panics if `idx` is out of bounds.
+#[target_feature(enable = "avx")]
+pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 {
+    union A {
+        a: __m256d,
+        b: [f64; 4],
+    };
+    A { a }.b[idx]
+}
+
+/// Panics unless `a` and `b` compare equal in all eight lanes.
+///
+/// The comparison is done with `_mm256_cmp_ps` using `_CMP_EQ_OQ`; the
+/// resulting mask from `_mm256_movemask_ps` must be `0b11111111`, otherwise
+/// both vectors are printed in the panic message.
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+    let cmp = _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+    if _mm256_movemask_ps(cmp) != 0b11111111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+/// Returns element `idx` of `a`, viewing the vector as a `[f32; 8]`.
+///
+/// The array indexing panics if `idx` is out of bounds.
+#[target_feature(enable = "avx")]
+pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
+    union A {
+        a: __m256,
+        b: [f32; 8],
+    };
+    A { a }.b[idx]
+}
+
+// These intrinsics don't exist on x86 because they require a 64-bit register,
+// which doesn't exist on x86!
+// Variant of `x86_polyfill` compiled only when `target_arch = "x86"`; it
+// provides test-only implementations of `_mm_insert_epi64` and
+// `_mm256_insert_epi64` written with plain array writes.
+#[cfg(target_arch = "x86")]
+mod x86_polyfill {
+    use core_arch::x86::*;
+
+    /// Returns `a` with the `i64` element at index `idx` replaced by `val`.
+    ///
+    /// The vector is viewed as a `[i64; 2]` through a local union; the array
+    /// indexing panics if `idx` is out of bounds.
+    pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64, idx: i32) -> __m128i {
+        union A {
+            a: __m128i,
+            b: [i64; 2],
+        };
+        let mut a = A { a };
+        a.b[idx as usize] = val;
+        a.a
+    }
+
+    /// Returns `a` with the `i64` element at index `idx` replaced by `val`.
+    ///
+    /// The vector is viewed as a `[i64; 4]` through a local union; the array
+    /// indexing panics if `idx` is out of bounds.
+    #[target_feature(enable = "avx2")]
+    pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64, idx: i32) -> __m256i {
+        union A {
+            a: __m256i,
+            b: [i64; 4],
+        };
+        let mut a = A { a };
+        a.b[idx as usize] = val;
+        a.a
+    }
+}
+#[cfg(target_arch = "x86_64")]
+mod x86_polyfill {
+    pub use core_arch::x86_64::{_mm256_insert_epi64, _mm_insert_epi64};
+}
+pub use self::x86_polyfill::*;
+
+/// Asserts that two `__m512i` values hold the same bits.
+///
+/// Each value is reinterpreted as a `[i32; 16]` through a local union and the
+/// two arrays are compared with `assert_eq!`.
+pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
+    union A {
+        a: __m512i,
+        b: [i32; 16],
+    }
+    assert_eq!(A { a }.b, A { a: b }.b)
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/xsave.rs b/library/stdarch/crates/core_arch/src/x86/xsave.rs
new file mode 100644
index 00000000000..c52dcd8c2a0
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/xsave.rs
@@ -0,0 +1,285 @@
+//! `i586`'s `xsave` and `xsaveopt` target feature intrinsics
+
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.xsave"]
+    fn xsave(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xrstor"]
+    fn xrstor(p: *const u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsetbv"]
+    fn xsetbv(v: u32, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsaveopt"]
+    fn xsaveopt(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsavec"]
+    fn xsavec(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsaves"]
+    fn xsaves(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xrstors"]
+    fn xrstors(p: *const u8, hi: u32, lo: u32) -> ();
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
+    xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
+    xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+/// `XFEATURE_ENABLED_MASK` for `XCR`
+///
+/// This intrinsic maps to `XSETBV` instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
+
+/// Copy 64-bits from `val` to the extended control register (`XCR`) specified
+/// by `a`.
+///
+/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsetbv)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsetbv(a: u32, val: u64) {
+    xsetbv(a, (val >> 32) as u32, val as u32);
+}
+
+/// Reads the contents of the extended control register `XCR`
+/// specified in `xcr_no`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xgetbv)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xgetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
+    let eax: u32;
+    let edx: u32;
+    asm!("xgetbv" : "={eax}"(eax), "={edx}"(edx) : "{ecx}"(xcr_no));
+    ((edx as u64) << 32) | (eax as u64)
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt)
+#[inline]
+#[target_feature(enable = "xsave,xsaveopt")]
+#[cfg_attr(test, assert_instr(xsaveopt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
+    xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec)
+#[inline]
+#[target_feature(enable = "xsave,xsavec")]
+#[cfg_attr(test, assert_instr(xsavec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
+    xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// `xsaves` differs from `xsave` in that it can save state components
+/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
+/// modified optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xsaves))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
+    xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xrstors))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
+    xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fmt;
+    use std::prelude::v1::*;
+
+    use core_arch::x86::*;
+    use stdsimd_test::simd_test;
+
+    /// 64-byte aligned scratch buffer that the tests pass to the
+    /// `_xsave*` / `_xrstor*` intrinsics as the save area.
+    #[repr(align(64))]
+    struct XsaveArea {
+        // max size for 256-bit registers is 800 bytes:
+        // see https://software.intel.com/en-us/node/682996
+        // max size for 512-bit registers is 2560 bytes:
+        // FIXME: add source
+        data: [u8; 2560],
+    }
+
+    impl XsaveArea {
+        /// Creates a zero-filled save area.
+        fn new() -> XsaveArea {
+            XsaveArea { data: [0; 2560] }
+        }
+        /// Returns a mutable pointer to the first byte of the save area.
+        fn ptr(&mut self) -> *mut u8 {
+            self.data.as_mut_ptr()
+        }
+    }
+
+    impl PartialEq<XsaveArea> for XsaveArea {
+        /// Two areas are equal when every byte of `data` matches.
+        fn eq(&self, other: &XsaveArea) -> bool {
+            // Compare as slices so the whole buffer is checked in one
+            // expression instead of an index loop.
+            self.data[..] == other.data[..]
+        }
+    }
+
+    impl fmt::Debug for XsaveArea {
+        /// Writes the bytes as `[b0, b1, ..., bN]`.
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "[")?;
+            for (i, byte) in self.data.iter().enumerate() {
+                // Separator goes before every element except the first.
+                if i != 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{}", byte)?;
+            }
+            write!(f, "]")
+        }
+    }
+
+    // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/209
+    /*
+    #[simd_test(enable = "xsave")]
+    unsafe fn xsave() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        _xsave(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsave(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+    */
+
+    #[simd_test(enable = "xsave")]
+    unsafe fn xgetbv_xsetbv() {
+        let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK;
+
+        let xcr: u64 = _xgetbv(xcr_n);
+        // FIXME: XSETBV is a privileged instruction we should only test this
+        // when running in privileged mode:
+        //
+        // _xsetbv(xcr_n, xcr);
+        let xcr_cpy: u64 = _xgetbv(xcr_n);
+        assert_eq!(xcr, xcr_cpy);
+    }
+
+    // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/209
+    /*
+    #[simd_test(enable = "xsave,xsaveopt")]
+    unsafe fn xsaveopt() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        _xsaveopt(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsaveopt(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+    */
+
+    // FIXME: this looks like a bug in Intel's SDE:
+    #[cfg(not(stdsimd_intel_sde))]
+    #[simd_test(enable = "xsave,xsavec")]
+    unsafe fn xsavec() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        _xsavec(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsavec(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/209
+    /*
+    #[simd_test(enable = "xsave,xsaves")]
+    unsafe fn xsaves() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        _xsaves(a.ptr(), m);
+        _xrstors(a.ptr(), m);
+        _xsaves(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+    */
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/abm.rs b/library/stdarch/crates/core_arch/src/x86_64/abm.rs
new file mode 100644
index 00000000000..7a655dbe90f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/abm.rs
@@ -0,0 +1,62 @@
+//! Advanced Bit Manipulation (ABM) instructions
+//!
+//! The POPCNT and LZCNT have their own CPUID bits to indicate support.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Counts the leading most significant zero bits.
+///
+/// When the operand is zero, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u64)
+#[inline]
+#[target_feature(enable = "lzcnt")]
+#[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
+    x.leading_zeros() as u64
+}
+
+/// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt64)
+#[inline]
+#[target_feature(enable = "popcnt")]
+#[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _popcnt64(x: i64) -> i32 {
+    x.count_ones() as i32
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::arch::x86_64::*;
+
+    #[simd_test(enable = "lzcnt")]
+    unsafe fn test_lzcnt_u64() {
+        assert_eq!(_lzcnt_u64(0b0101_1010), 57);
+    }
+
+    #[simd_test(enable = "popcnt")]
+    unsafe fn test_popcnt64() {
+        assert_eq!(_popcnt64(0b0101_1010), 4);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/adx.rs b/library/stdarch/crates/core_arch/src/x86_64/adx.rs
new file mode 100644
index 00000000000..0343351b916
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/adx.rs
@@ -0,0 +1,46 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+    #[link_name = "llvm.x86.addcarry.u64"]
+    fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
+    #[link_name = "llvm.x86.subborrow.u64"]
+    fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
+}
+
+/// Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in
+/// (carry flag), and store the unsigned 64-bit result in out, and the carry-out
+/// is returned (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+    let (a, b) = llvm_addcarry_u64(c_in, a, b);
+    *out = b;
+    a
+}
+
+/// Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in
+/// (carry or overflow flag), and store the unsigned 64-bit result in out, and
+/// the carry-out is returned (carry or overflow flag).
+#[inline]
+#[target_feature(enable = "adx")]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+#[cfg(not(stage0))]
+pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+    _addcarry_u64(c_in, a, b, out)
+}
+
+/// Subtract unsigned 64-bit integer b and unsigned 8-bit borrow-in c_in
+/// (carry flag) from unsigned 64-bit integer a, store the unsigned 64-bit
+/// result in out, and return the borrow-out (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(sbb))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+    let (a, b) = llvm_subborrow_u64(c_in, a, b);
+    *out = b;
+    a
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx.rs b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
new file mode 100644
index 00000000000..429ee75c59a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
@@ -0,0 +1,46 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use mem;
+
+/// Copy `a` to result, and insert the 64-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi64)
+#[inline]
+#[rustc_args_required_const(2)]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i {
+    mem::transmute(simd_insert(a.as_i64x4(), (index as u32) & 3, i))
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "avx")]
+    unsafe fn test_mm256_insert_epi64() {
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        let r = _mm256_insert_epi64(a, 0, 3);
+        let e = _mm256_setr_epi64x(1, 2, 3, 0);
+        assert_eq_m256i(r, e);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx2.rs b/library/stdarch/crates/core_arch/src/x86_64/avx2.rs
new file mode 100644
index 00000000000..a27f3125751
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx2.rs
@@ -0,0 +1,49 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+
+/// Extract a 64-bit integer from `a`, selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[rustc_args_required_const(1)]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 {
+    let imm8 = (imm8 & 3) as u32;
+    simd_extract(a.as_i64x4(), imm8)
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::arch::x86_64::*;
+
+    #[simd_test(enable = "avx2")]
+    unsafe fn test_mm256_extract_epi64() {
+        let a = _mm256_setr_epi64x(0, 1, 2, 3);
+        let r = _mm256_extract_epi64(a, 3);
+        assert_eq!(r, 3);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs
new file mode 100644
index 00000000000..be3ced9e251
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs
@@ -0,0 +1,184 @@
+//! Bit Manipulation Instruction (BMI) Set 1.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Extracts bits in range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
+    _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64)
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range
+/// to be extracted, and bits `[15,8]` specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
+    x86_bmi_bextr_64(a, control)
+}
+
+/// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
+    !a & b
+}
+
+/// Extract lowest set isolated bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsi))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsi_u64(x: u64) -> u64 {
+    x & x.wrapping_neg()
+}
+
+/// Get mask up to lowest set bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
+    x ^ (x.wrapping_sub(1_u64))
+}
+
+/// Resets the lowest set bit of `x`.
+///
+/// If `x` is zero, the underlying `blsr` instruction sets CF.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsr))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsr_u64(x: u64) -> u64 {
+    x & (x.wrapping_sub(1))
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
+    x.trailing_zeros() as u64
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 {
+    x.trailing_zeros() as i64
+}
+
+extern "C" {
+    #[link_name = "llvm.x86.bmi.bextr.64"]
+    fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+    use core_arch::x86_64::*;
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_bextr_u64() {
+        let r = _bextr_u64(0b0101_0000u64, 4, 4);
+        assert_eq!(r, 0b0000_0101u64);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_andn_u64() {
+        assert_eq!(_andn_u64(0, 0), 0);
+        assert_eq!(_andn_u64(0, 1), 1);
+        assert_eq!(_andn_u64(1, 0), 0);
+        assert_eq!(_andn_u64(1, 1), 0);
+
+        let r = _andn_u64(0b0000_0000u64, 0b0000_0000u64);
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = _andn_u64(0b0000_0000u64, 0b1111_1111u64);
+        assert_eq!(r, 0b1111_1111u64);
+
+        let r = _andn_u64(0b1111_1111u64, 0b0000_0000u64);
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = _andn_u64(0b1111_1111u64, 0b1111_1111u64);
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = _andn_u64(0b0100_0000u64, 0b0101_1101u64);
+        assert_eq!(r, 0b0001_1101u64);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsi_u64() {
+        assert_eq!(_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsmsk_u64() {
+        let r = _blsmsk_u64(0b0011_0000u64);
+        assert_eq!(r, 0b0001_1111u64);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsr_u64() {
+        // TODO: test the behavior when the input is 0
+        let r = _blsr_u64(0b0011_0000u64);
+        assert_eq!(r, 0b0010_0000u64);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_tzcnt_u64() {
+        assert_eq!(_tzcnt_u64(0b0000_0001u64), 0u64);
+        assert_eq!(_tzcnt_u64(0b0000_0000u64), 64u64);
+        assert_eq!(_tzcnt_u64(0b1001_0000u64), 4u64);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs
new file mode 100644
index 00000000000..98d804ead2c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs
@@ -0,0 +1,139 @@
+//! Bit Manipulation Instruction (BMI) Set 2.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Unsigned multiply without affecting flags.
+///
+/// Unsigned multiplication of `a` with `b`, returning the low half of the
+/// 128-bit result and storing the high half in `hi`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u64)
+#[inline]
+#[cfg_attr(test, assert_instr(mulx))]
+#[target_feature(enable = "bmi2")]
+#[cfg(not(target_arch = "x86"))] // calls an intrinsic
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 {
+    let result: u128 = (a as u128) * (b as u128);
+    *hi = (result >> 64) as u64;
+    result as u64
+}
+
+/// Zero higher bits of `a` >= `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 {
+    x86_bmi2_bzhi_64(a, index as u64)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pdep))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pdep_64(a, mask)
+}
+
+/// Gathers the bits of `x` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pext))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pext_64(a, mask)
+}
+
+extern "C" {
+    #[link_name = "llvm.x86.bmi.bzhi.64"]
+    fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
+    #[link_name = "llvm.x86.bmi.pdep.64"]
+    fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64;
+    #[link_name = "llvm.x86.bmi.pext.64"]
+    fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86_64::*;
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_pext_u64() {
+        let n = 0b1011_1110_1001_0011u64;
+
+        let m0 = 0b0110_0011_1000_0101u64;
+        let s0 = 0b0000_0000_0011_0101u64;
+
+        let m1 = 0b1110_1011_1110_1111u64;
+        let s1 = 0b0001_0111_0100_0011u64;
+
+        assert_eq!(_pext_u64(n, m0), s0);
+        assert_eq!(_pext_u64(n, m1), s1);
+    }
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_pdep_u64() {
+        let n = 0b1011_1110_1001_0011u64;
+
+        let m0 = 0b0110_0011_1000_0101u64;
+        let s0 = 0b0000_0010_0000_0101u64;
+
+        let m1 = 0b1110_1011_1110_1111u64;
+        let s1 = 0b1110_1001_0010_0011u64;
+
+        assert_eq!(_pdep_u64(n, m0), s0);
+        assert_eq!(_pdep_u64(n, m1), s1);
+    }
+
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_bzhi_u64() {
+        let n = 0b1111_0010u64;
+        let s = 0b0001_0010u64;
+        assert_eq!(_bzhi_u64(n, 5), s);
+    }
+
+    #[simd_test(enable = "bmi2")]
+    #[rustfmt::skip]
+    unsafe fn test_mulx_u64() {
+        let a: u64 = 9_223_372_036_854_775_800;
+        let b: u64 = 100;
+        let mut hi = 0;
+        let lo = _mulx_u64(a, b, &mut hi);
+        /*
+result = 922337203685477580000 =
+0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000
+  ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        */
+        assert_eq!(
+            lo,
+            0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64
+        );
+        assert_eq!(hi, 0b00110001u64);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bswap.rs b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs
new file mode 100644
index 00000000000..75bb33c956c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs
@@ -0,0 +1,35 @@
+//! Byte swap intrinsics.
+
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Return an integer with the reversed byte order of x
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap64)
+#[inline]
+#[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bswap64(x: i64) -> i64 {
+    bswap_i64(x)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.bswap.i64"]
+    fn bswap_i64(x: i64) -> i64;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bswap64() {
+        unsafe {
+            assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
+            assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs
new file mode 100644
index 00000000000..c7b43a4469b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs
@@ -0,0 +1,75 @@
+use sync::atomic::Ordering;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Compare and exchange 16 bytes (128 bits) of data atomically.
+///
+/// This intrinsic corresponds to the `cmpxchg16b` instruction on x86_64
+/// processors. It performs an atomic compare-and-swap, updating the `dst`
+/// memory location to `new` if the current value in memory equals `old`.
+///
+/// # Return value
+///
+/// This function returns the previous value at the memory location. If it is
+/// equal to `old` then the memory was updated to `new`.
+///
+/// # Memory Orderings
+///
+/// This atomic operation has the same semantics of memory orderings as
+/// `AtomicUsize::compare_exchange` does, only operating on 16 bytes of memory
+/// instead of just a pointer.
+///
+/// For more information on memory orderings here see the `compare_exchange`
+/// documentation for other `Atomic*` types in the standard library.
+///
+/// # Unsafety
+///
+/// This method is unsafe because it takes a raw pointer and will attempt to
+/// read and possibly write the memory at the pointer. The pointer must also be
+/// aligned on a 16-byte boundary.
+///
+/// This method also requires the `cmpxchg16b` CPU feature to be available at
+/// runtime to work correctly. If the CPU running the binary does not actually
+/// support `cmpxchg16b` and the program enters an execution path that
+/// eventually would reach this function the behavior is undefined.
+///
+/// The `success` ordering must also be stronger or equal to `failure`, or this
+/// function call is undefined. See the `Atomic*` documentation's
+/// `compare_exchange` function for more information. When `compare_exchange`
+/// panics, this is undefined behavior. Currently this function aborts the
+/// process with an undefined instruction.
+#[inline]
+#[cfg_attr(test, assert_instr(cmpxchg16b, success = Ordering::SeqCst, failure = Ordering::SeqCst))]
+#[target_feature(enable = "cmpxchg16b")]
+#[cfg(not(stage0))]
+pub unsafe fn cmpxchg16b(
+    dst: *mut u128,
+    old: u128,
+    new: u128,
+    success: Ordering,
+    failure: Ordering,
+) -> u128 {
+    use intrinsics;
+    use sync::atomic::Ordering::*;
+
+    debug_assert!(dst as usize % 16 == 0);
+
+    let (val, _ok) = match (success, failure) {
+        (Acquire, Acquire) => intrinsics::atomic_cxchg_acq(dst, old, new),
+        (Release, Relaxed) => intrinsics::atomic_cxchg_rel(dst, old, new),
+        (AcqRel, Acquire) => intrinsics::atomic_cxchg_acqrel(dst, old, new),
+        (Relaxed, Relaxed) => intrinsics::atomic_cxchg_relaxed(dst, old, new),
+        (SeqCst, SeqCst) => intrinsics::atomic_cxchg(dst, old, new),
+        (Acquire, Relaxed) => intrinsics::atomic_cxchg_acq_failrelaxed(dst, old, new),
+        (AcqRel, Relaxed) => intrinsics::atomic_cxchg_acqrel_failrelaxed(dst, old, new),
+        (SeqCst, Relaxed) => intrinsics::atomic_cxchg_failrelaxed(dst, old, new),
+        (SeqCst, Acquire) => intrinsics::atomic_cxchg_failacq(dst, old, new),
+
+        // The above block is all copied from libcore, and this statement is
+        // also copied from libcore except that it's a panic in libcore and we
+        // have a little bit more of a lightweight panic here.
+        _ => ::core_arch::x86::ud2(),
+    };
+    val
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs
new file mode 100644
index 00000000000..dde2f5d8296
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs
@@ -0,0 +1,112 @@
+//! FXSR floating-point context fast save and restore.
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.fxsave64"]
+    fn fxsave64(p: *mut u8) -> ();
+    #[link_name = "llvm.x86.fxrstor64"]
+    fn fxrstor64(p: *const u8) -> ();
+}
+
+/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// A misaligned destination operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave64)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxsave64(mem_addr: *mut u8) {
+    fxsave64(mem_addr)
+}
+
+/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// The contents of this memory region should have been written by a
+/// previous
+/// `_fxsave` or `_fxsave64` intrinsic.
+///
+/// A misaligned source operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor64)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxrstor64(mem_addr: *const u8) {
+    fxrstor64(mem_addr)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::x86_64::*;
+    use std::{cmp::PartialEq, fmt};
+    use stdsimd_test::simd_test;
+
+    #[repr(align(16))]
+    struct FxsaveArea {
+        data: [u8; 512], // 512 bytes
+    }
+
+    impl FxsaveArea {
+        fn new() -> FxsaveArea {
+            FxsaveArea { data: [0; 512] }
+        }
+        fn ptr(&mut self) -> *mut u8 {
+            &mut self.data[0] as *mut _ as *mut u8
+        }
+    }
+
+    impl PartialEq<FxsaveArea> for FxsaveArea {
+        fn eq(&self, other: &FxsaveArea) -> bool {
+            for i in 0..self.data.len() {
+                if self.data[i] != other.data[i] {
+                    return false;
+                }
+            }
+            true
+        }
+    }
+
+    impl fmt::Debug for FxsaveArea {
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "[")?;
+            for i in 0..self.data.len() {
+                write!(f, "{}", self.data[i])?;
+                if i != self.data.len() - 1 {
+                    write!(f, ", ")?;
+                }
+            }
+            write!(f, "]")
+        }
+    }
+
+    #[simd_test(enable = "fxsr")]
+    unsafe fn fxsave64() {
+        let mut a = FxsaveArea::new();
+        let mut b = FxsaveArea::new();
+
+        fxsr::_fxsave64(a.ptr());
+        fxsr::_fxrstor64(a.ptr());
+        fxsr::_fxsave64(b.ptr());
+        assert_eq!(a, b);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/mod.rs b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
new file mode 100644
index 00000000000..ca16e4f17be
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
@@ -0,0 +1,46 @@
+//! `x86_64` intrinsics
+
+mod fxsr;
+pub use self::fxsr::*;
+
+mod sse;
+pub use self::sse::*;
+
+mod sse2;
+pub use self::sse2::*;
+
+mod sse41;
+pub use self::sse41::*;
+
+mod sse42;
+pub use self::sse42::*;
+
+mod xsave;
+pub use self::xsave::*;
+
+mod abm;
+pub use self::abm::*;
+
+mod avx;
+pub use self::avx::*;
+
+mod bmi;
+pub use self::bmi::*;
+
+mod bmi2;
+pub use self::bmi2::*;
+
+mod avx2;
+pub use self::avx2::*;
+
+mod bswap;
+pub use self::bswap::*;
+
+mod rdrand;
+pub use self::rdrand::*;
+
+mod cmpxchg16b;
+pub use self::cmpxchg16b::*;
+
+mod adx;
+pub use self::adx::*;
diff --git a/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs
new file mode 100644
index 00000000000..7cc0d710c81
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs
@@ -0,0 +1,43 @@
+//! RDRAND and RDSEED instructions for returning random numbers from an Intel
+//! on-chip hardware random number generator which has been seeded by an
+//! on-chip entropy source.
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+    #[link_name = "llvm.x86.rdrand.64"]
+    fn x86_rdrand64_step() -> (u64, i32);
+    #[link_name = "llvm.x86.rdseed.64"]
+    fn x86_rdseed64_step() -> (u64, i32);
+}
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Read a hardware generated 64-bit random value and store the result in val.
+/// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand64_step)
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
+    let (v, flag) = x86_rdrand64_step();
+    *val = v;
+    flag
+}
+
+/// Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store
+/// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed64_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 {
+    let (v, flag) = x86_rdseed64_step();
+    *val = v;
+    flag
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse.rs b/library/stdarch/crates/core_arch/src/x86_64/sse.rs
new file mode 100644
index 00000000000..a3126e72e94
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse.rs
@@ -0,0 +1,152 @@
+//! `x86_64` Streaming SIMD Extensions (SSE)
+
+use core_arch::x86::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse.cvtss2si64"]
+    fn cvtss2si64(a: __m128) -> i64;
+    #[link_name = "llvm.x86.sse.cvttss2si64"]
+    fn cvttss2si64(a: __m128) -> i64;
+    #[link_name = "llvm.x86.sse.cvtsi642ss"]
+    fn cvtsi642ss(a: __m128, b: i64) -> __m128;
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 64 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the result
+/// cannot be represented as a 64 bit integer the result will be
+/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or trigger an invalid operation
+/// floating point exception if unmasked (see
+/// [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 {
+    cvtss2si64(a)
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 64 bit integer
+/// with truncation.
+///
+/// The result is rounded always using truncation (round towards zero). If the
+/// result cannot be represented as a 64 bit integer the result will be
+/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or an invalid operation floating
+/// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 {
+    cvttss2si64(a)
+}
+
+/// Convert a 64 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit
+/// input).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
+    cvtsi642ss(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::f32::NAN;
+    use std::i64::MIN;
+
+    use stdsimd_test::simd_test;
+
+    use core_arch::arch::x86_64::*;
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cvtss_si64() {
+        let inputs = &[
+            (42.0f32, 42i64),
+            (-31.4, -31),
+            (-33.5, -34),
+            (-34.5, -34),
+            (4.0e10, 40_000_000_000),
+            (4.0e-10, 0),
+            (NAN, MIN),
+            (2147483500.1, 2147483520),
+            (9.223371e18, 9223370937343148032),
+        ];
+        for i in 0..inputs.len() {
+            let (xi, e) = inputs[i];
+            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+            let r = _mm_cvtss_si64(x);
+            assert_eq!(
+                e, r,
+                "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}",
+                i, x, r, e
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_cvttss_si64() {
+        let inputs = &[
+            (42.0f32, 42i64),
+            (-31.4, -31),
+            (-33.5, -33),
+            (-34.5, -34),
+            (10.999, 10),
+            (-5.99, -5),
+            (4.0e10, 40_000_000_000),
+            (4.0e-10, 0),
+            (NAN, MIN),
+            (2147483500.1, 2147483520),
+            (9.223371e18, 9223370937343148032),
+            (9.223372e18, MIN),
+        ];
+        for i in 0..inputs.len() {
+            let (xi, e) = inputs[i];
+            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+            let r = _mm_cvttss_si64(x);
+            assert_eq!(
+                e, r,
+                "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}",
+                i, x, r, e
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    pub unsafe fn test_mm_cvtsi64_ss() {
+        let inputs = &[
+            (4555i64, 4555.0f32),
+            (322223333, 322223330.0),
+            (-432, -432.0),
+            (-322223333, -322223330.0),
+            (9223372036854775807, 9.223372e18),
+            (-9223372036854775808, -9.223372e18),
+        ];
+
+        for i in 0..inputs.len() {
+            let (x, f) = inputs[i];
+            let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+            let r = _mm_cvtsi64_ss(a, x);
+            let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
+            assert_eq_m128(e, r);
+        }
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
new file mode 100644
index 00000000000..779be0a5930
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
@@ -0,0 +1,210 @@
+//! `x86_64`'s Streaming SIMD Extensions 2 (SSE2)
+
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use intrinsics;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse2.cvtsd2si64"]
+    fn cvtsd2si64(a: __m128d) -> i64;
+    #[link_name = "llvm.x86.sse2.cvttsd2si64"]
+    fn cvttsd2si64(a: __m128d) -> i64;
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a`
+/// to a 64-bit integer.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 {
+    cvtsd2si64(a)
+}
+
+/// Alias for `_mm_cvtsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 {
+    _mm_cvtsd_si64(a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a`
+/// to a 64-bit integer with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 {
+    cvttsd2si64(a)
+}
+
+/// Alias for `_mm_cvttsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
+    _mm_cvttsd_si64(a)
+}
+
+/// Stores a 64-bit integer value in the specified memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movnti))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
+    intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Return a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i {
+    _mm_set_epi64x(0, a)
+}
+
+/// Return a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i {
+    _mm_cvtsi64_si128(a)
+}
+
+/// Return the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 {
+    simd_extract(a.as_i64x2(), 0)
+}
+
+/// Return the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 {
+    _mm_cvtsi128_si64(a)
+}
+
+/// Return `a` with its lower element replaced by `b` after converting it to
+/// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d {
+    simd_insert(a, 0, b as f64)
+}
+
+/// Return `a` with its lower element replaced by `b` after converting it to
+/// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d {
+    _mm_cvtsi64_sd(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{f64, i64};
+
+    use stdsimd_test::simd_test;
+
+    use core_arch::arch::x86_64::*;
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsd_si64() {
+        let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0));
+        assert_eq!(r, -2_i64);
+
+        let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN));
+        assert_eq!(r, i64::MIN);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsd_si64x() {
+        let r = _mm_cvtsd_si64x(_mm_setr_pd(f64::NAN, f64::NAN));
+        assert_eq!(r, i64::MIN);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvttsd_si64() {
+        let a = _mm_setr_pd(-1.1, 2.2);
+        let r = _mm_cvttsd_si64(a);
+        assert_eq!(r, -1_i64);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvttsd_si64x() {
+        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+        let r = _mm_cvttsd_si64x(a);
+        assert_eq!(r, i64::MIN);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_stream_si64() {
+        let a: i64 = 7;
+        let mut mem = ::std::boxed::Box::<i64>::new(-1);
+        _mm_stream_si64(&mut *mem as *mut i64, a);
+        assert_eq!(a, *mem);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi64_si128() {
+        let r = _mm_cvtsi64_si128(5);
+        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi128_si64() {
+        let r = _mm_cvtsi128_si64(_mm_setr_epi64x(5, 0));
+        assert_eq!(r, 5);
+    }
+
+    #[simd_test(enable = "sse2")]
+    unsafe fn test_mm_cvtsi64_sd() {
+        let a = _mm_set1_pd(3.5);
+        let r = _mm_cvtsi64_sd(a, 5);
+        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse41.rs b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs
new file mode 100644
index 00000000000..9a22370019c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs
@@ -0,0 +1,59 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.1 (SSE4.1)
+
+use core_arch::simd_llvm::*;
+use core_arch::x86::*;
+use mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Extract a 64-bit integer from `a` selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(pextrq, imm8 = 1))]
+#[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 {
+    let imm8 = (imm8 & 1) as u32;
+    simd_extract(a.as_i64x2(), imm8)
+}
+
+/// Return a copy of `a` with the 64-bit integer from `i` inserted at a
+/// location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))]
+#[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i64x2(), (imm8 & 1) as u32, i))
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arch::x86_64::*;
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_extract_epi64() {
+        let a = _mm_setr_epi64x(0, 1);
+        let r = _mm_extract_epi64(a, 1);
+        assert_eq!(r, 1);
+        let r = _mm_extract_epi64(a, 3);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "sse4.1")]
+    unsafe fn test_mm_insert_epi64() {
+        let a = _mm_set1_epi64x(0);
+        let e = _mm_setr_epi64x(0, 32);
+        let r = _mm_insert_epi64(a, 32, 1);
+        assert_eq_m128i(r, e);
+        let r = _mm_insert_epi64(a, 32, 3);
+        assert_eq_m128i(r, e);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse42.rs b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
new file mode 100644
index 00000000000..3f1b6140625
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
@@ -0,0 +1,37 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.2 (SSE4.2)
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.sse42.crc32.64.64"]
+    fn crc32_64_64(crc: u64, v: u64) -> u64;
+}
+
+/// Starting with the initial value in `crc`, return the accumulated
+/// CRC32 value for unsigned 64-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u64)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 {
+    crc32_64_64(crc, v)
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arch::x86_64::*;
+
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "sse4.2")]
+    unsafe fn test_mm_crc32_u64() {
+        let crc = 0x7819dccd3e824;
+        let v = 0x2a22b845fed;
+        let i = _mm_crc32_u64(crc, v);
+        assert_eq!(i, 0xbb6cdc6c);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/xsave.rs b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs
new file mode 100644
index 00000000000..875b677dbd2
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs
@@ -0,0 +1,227 @@
+//! `x86_64`'s `xsave` and `xsaveopt` target feature intrinsics
+
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.xsave64"]
+    fn xsave64(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xrstor64"]
+    fn xrstor64(p: *const u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsaveopt64"]
+    fn xsaveopt64(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsavec64"]
+    fn xsavec64(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xsaves64"]
+    fn xsaves64(p: *mut u8, hi: u32, lo: u32) -> ();
+    #[link_name = "llvm.x86.xrstors64"]
+    fn xrstors64(p: *const u8, hi: u32, lo: u32) -> ();
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave64)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
+    xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor64)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
+    xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE64` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt64)
+#[inline]
+#[target_feature(enable = "xsave,xsaveopt")]
+#[cfg_attr(test, assert_instr(xsaveopt64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
+    xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec64)
+#[inline]
+#[target_feature(enable = "xsave,xsavec")]
+#[cfg_attr(test, assert_instr(xsavec64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
+    xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// `xsaves` differs from `xsave` in that it can save state components
+/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
+/// modified optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves64)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xsaves64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
+    xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors64)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xrstors64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
+    xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/209
+// All these tests fail with Intel SDE.
+/*
+#[cfg(test)]
+mod tests {
+    use core_arch::x86::x86_64::xsave;
+    use stdsimd_test::simd_test;
+    use std::fmt;
+
+    // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/209
+    #[repr(align(64))]
+    struct XsaveArea {
+        // max size for 256-bit registers is 800 bytes:
+        // see https://software.intel.com/en-us/node/682996
+        // max size for 512-bit registers is 2560 bytes:
+        // FIXME: add source
+        data: [u8; 2560],
+    }
+
+    impl XsaveArea {
+        fn new() -> XsaveArea {
+            XsaveArea { data: [0; 2560] }
+        }
+        fn ptr(&mut self) -> *mut u8 {
+            &mut self.data[0] as *mut _ as *mut u8
+        }
+    }
+
+    impl PartialEq<XsaveArea> for XsaveArea {
+        fn eq(&self, other: &XsaveArea) -> bool {
+            for i in 0..self.data.len() {
+                if self.data[i] != other.data[i] {
+                    return false;
+                }
+            }
+            true
+        }
+    }
+
+    impl fmt::Debug for XsaveArea {
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "[")?;
+            for i in 0..self.data.len() {
+                write!(f, "{}", self.data[i])?;
+                if i != self.data.len() - 1 {
+                    write!(f, ", ")?;
+                }
+            }
+            write!(f, "]")
+        }
+    }
+
+    #[simd_test(enable = "xsave")]
+    unsafe fn xsave64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        xsave::_xsave64(a.ptr(), m);
+        xsave::_xrstor64(a.ptr(), m);
+        xsave::_xsave64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[simd_test(enable = "xsave,xsaveopt")]
+    unsafe fn xsaveopt64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        xsave::_xsaveopt64(a.ptr(), m);
+        xsave::_xrstor64(a.ptr(), m);
+        xsave::_xsaveopt64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[simd_test(enable = "xsave,xsavec")]
+    unsafe fn xsavec64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        xsave::_xsavec64(a.ptr(), m);
+        xsave::_xrstor64(a.ptr(), m);
+        xsave::_xsavec64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[simd_test(enable = "xsave,xsaves")]
+    unsafe fn xsaves64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = XsaveArea::new();
+        let mut b = XsaveArea::new();
+
+        xsave::_xsaves64(a.ptr(), m);
+        xsave::_xrstors64(a.ptr(), m);
+        xsave::_xsaves64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+}
+*/
diff --git a/library/stdarch/crates/coresimd/tests/cpu-detection.rs b/library/stdarch/crates/core_arch/tests/cpu-detection.rs
index ccbb9eef1c4..454176b18c4 100644
--- a/library/stdarch/crates/coresimd/tests/cpu-detection.rs
+++ b/library/stdarch/crates/core_arch/tests/cpu-detection.rs
@@ -7,7 +7,7 @@
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[macro_use]
-extern crate stdsimd;
+extern crate std_detect;
 
 #[test]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
diff --git a/library/stdarch/crates/stdsimd/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
index aacfe97a3d1..61dd7790ed2 100644
--- a/library/stdarch/crates/stdsimd/Cargo.toml
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "stdsimd"
+name = "std_detect"
 version = "0.1.3"
 authors = [
     "Alex Crichton <alex@alexcrichton.com>",
@@ -11,7 +11,7 @@ documentation = "https://docs.rs/stdsimd"
 homepage = "https://github.com/rust-lang-nursery/stdsimd"
 repository = "https://github.com/rust-lang-nursery/stdsimd"
 readme = "README.md"
-keywords = ["std", "simd", "intrinsics"]
+keywords = ["std", "run-time", "feature", "detection"]
 categories = ["hardware-support"]
 license = "MIT/Apache-2.0"
 
@@ -23,27 +23,10 @@ is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" }
 maintenance = { status = "experimental" }
 
 [dependencies]
-coresimd = { version = "0.1.3", path = "../coresimd" }
 libc = "0.2"
 cfg-if = "0.1"
 
 [dev-dependencies]
+core_arch = { version = "0.1.3", path = "../core_arch" }
 auxv = "0.3.3"
-quickcheck = "0.8"
-rand = "0.6"
-cupid = "0.6.0"
-
-[target.'cfg(target_arch = "wasm32")'.dependencies]
-rand = { version = "0.6", features = ["wasm-bindgen"] }
-
-[[example]]
-name = "hex"
-path = "../../examples/hex.rs"
-
-[[example]]
-name = "wasm"
-crate-type = ["cdylib"]
-path = "../../examples/wasm.rs"
-
-[features]
-default = []
+cupid = "0.6.0"
\ No newline at end of file
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs
new file mode 100644
index 00000000000..882c22cc174
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs
@@ -0,0 +1,103 @@
+//! Aarch64 run-time features.
+
+/// Checks if `aarch64` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_aarch64_feature_detected {
+    ("neon") => {
+        // FIXME: this should be removed once we rename Aarch64 neon to asimd
+        cfg!(target_feature = "neon") ||
+            $crate::detect::check_for($crate::detect::Feature::asimd)
+    };
+    ("asimd") => {
+        cfg!(target_feature = "neon") ||
+            $crate::detect::check_for($crate::detect::Feature::asimd)
+    };
+    ("pmull") => {
+        cfg!(target_feature = "pmull") ||
+            $crate::detect::check_for($crate::detect::Feature::pmull)
+    };
+    ("fp") => {
+        cfg!(target_feature = "fp") ||
+            $crate::detect::check_for($crate::detect::Feature::fp)
+    };
+    ("fp16") => {
+        cfg!(target_feature = "fp16") ||
+            $crate::detect::check_for($crate::detect::Feature::fp16)
+    };
+    ("sve") => {
+        cfg!(target_feature = "sve") ||
+            $crate::detect::check_for($crate::detect::Feature::sve)
+    };
+    ("crc") => {
+        cfg!(target_feature = "crc") ||
+            $crate::detect::check_for($crate::detect::Feature::crc)
+    };
+    ("crypto") => {
+        cfg!(target_feature = "crypto") ||
+            $crate::detect::check_for($crate::detect::Feature::crypto)
+    };
+    ("lse") => {
+        cfg!(target_feature = "lse") ||
+            $crate::detect::check_for($crate::detect::Feature::lse)
+    };
+    ("rdm") => {
+        cfg!(target_feature = "rdm") ||
+            $crate::detect::check_for($crate::detect::Feature::rdm)
+    };
+    ("rcpc") => {
+        cfg!(target_feature = "rcpc") ||
+            $crate::detect::check_for($crate::detect::Feature::rcpc)
+    };
+    ("dotprod") => {
+        cfg!(target_feature = "dotprod") ||
+            $crate::detect::check_for($crate::detect::Feature::dotprod)
+    };
+    ("ras") => {
+        compile_error!("\"ras\" feature cannot be detected at run-time")
+    };
+    ("v8.1a") => {
+        compile_error!("\"v8.1a\" feature cannot be detected at run-time")
+    };
+    ("v8.2a") => {
+        compile_error!("\"v8.2a\" feature cannot be detected at run-time")
+    };
+    ("v8.3a") => {
+        compile_error!("\"v8.3a\" feature cannot be detected at run-time")
+    };
+    ($t:tt) => { compile_error!(concat!("unknown aarch64 target feature: ", $t)) };
+}
+
+/// ARM Aarch64 CPU Feature enum. Each variant denotes a position in a bitset
+/// for a particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// ARM Advanced SIMD (ASIMD)
+    asimd,
+    /// Polynomial Multiply
+    pmull,
+    /// Floating point support
+    fp,
+    /// Half-float support.
+    fp16,
+    /// Scalable Vector Extension (SVE)
+    sve,
+    /// CRC32 (Cyclic Redundancy Check)
+    crc,
+    /// Crypto: AES + PMULL + SHA1 + SHA2
+    crypto,
+    /// Atomics (Large System Extension)
+    lse,
+    /// Rounding Double Multiply (ASIMDRDM)
+    rdm,
+    /// Release consistent Processor consistent (RcPc)
+    rcpc,
+    /// Vector Dot-Product (ASIMDDP)
+    dotprod,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs
new file mode 100644
index 00000000000..cb6ac6badcc
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs
@@ -0,0 +1,36 @@
+//! Run-time feature detection on ARM Aarch32.
+
+/// Checks if `arm` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_arm_feature_detected {
+    ("neon") => {
+        cfg!(target_feature = "neon") ||
+            $crate::detect::check_for($crate::detect::Feature::neon)
+    };
+    ("pmull") => {
+        cfg!(target_feature = "pmull") ||
+            $crate::detect::check_for($crate::detect::Feature::pmull)
+    };
+    ("v7") => { compile_error!("\"v7\" feature cannot be detected at run-time") };
+    ("vfp2") => { compile_error!("\"vfp2\" feature cannot be detected at run-time") };
+    ("vfp3") => { compile_error!("\"vfp3\" feature cannot be detected at run-time") };
+    ("vfp4") => { compile_error!("\"vfp4\" feature cannot be detected at run-time") };
+    ($t:tt) => { compile_error!(concat!("unknown arm target feature: ", $t)) };
+}
+
+/// ARM CPU Feature enum. Each variant denotes a position in a bitset for a
+/// particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// ARM Advanced SIMD (NEON) - Aarch32
+    neon,
+    /// Polynomial Multiply
+    pmull,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs
new file mode 100644
index 00000000000..876f8dde262
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs
@@ -0,0 +1,26 @@
+//! Run-time feature detection on MIPS.
+
+/// Checks if `mips` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_mips_feature_detected {
+    ("msa") => {
+        cfg!(target_feature = "msa") ||
+            $crate::detect::check_for($crate::detect::Feature::msa)
+    };
+    ($t:tt) => { compile_error!(concat!("unknown mips target feature: ", $t)) };
+}
+
+/// MIPS CPU Feature enum. Each variant denotes a position in a bitset for a
+/// particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// MIPS SIMD Architecture (MSA)
+    msa,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs
new file mode 100644
index 00000000000..ab837b3d5c9
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs
@@ -0,0 +1,26 @@
+//! Run-time feature detection on MIPS64.
+
+/// Checks if `mips64` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_mips64_feature_detected {
+    ("msa") => {
+        cfg!(target_feature = "msa") ||
+            $crate::detect::check_for($crate::detect::Feature::msa)
+    };
+    ($t:tt) => { compile_error!(concat!("unknown mips64 target feature: ", $t)) };
+}
+
+/// MIPS64 CPU Feature enum. Each variant denotes a position in a bitset
+/// for a particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// MIPS SIMD Architecture (MSA)
+    msa,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs
new file mode 100644
index 00000000000..9c440b1d6b0
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs
@@ -0,0 +1,39 @@
+//! Run-time feature detection on PowerPC.
+
+/// Checks if `powerpc` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_powerpc_feature_detected {
+    ("altivec") => {
+        cfg!(target_feature = "altivec") ||
+            $crate::detect::check_for($crate::detect::Feature::altivec)
+    };
+    ("vsx") => {
+        cfg!(target_feature = "vsx") ||
+            $crate::detect::check_for($crate::detect::Feature::vsx)
+    };
+    ("power8") => {
+        cfg!(target_feature = "power8") ||
+            $crate::detect::check_for($crate::detect::Feature::power8)
+    };
+    ($t:tt) => { compile_error!(concat!("unknown powerpc target feature: ", $t)) };
+}
+
+
+/// PowerPC CPU Feature enum. Each variant denotes a position in a bitset
+/// for a particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// Altivec
+    altivec,
+    /// VSX
+    vsx,
+    /// Power8
+    power8,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs
new file mode 100644
index 00000000000..910940f0bb9
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs
@@ -0,0 +1,39 @@
+//! Run-time feature detection on PowerPC64.
+
+/// Checks if `powerpc64` feature is enabled.
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+#[allow_internal_unstable]
+macro_rules! is_powerpc64_feature_detected {
+    ("altivec") => {
+        cfg!(target_feature = "altivec") ||
+            $crate::detect::check_for($crate::detect::Feature::altivec)
+    };
+    ("vsx") => {
+        cfg!(target_feature = "vsx") ||
+            $crate::detect::check_for($crate::detect::Feature::vsx)
+    };
+    ("power8") => {
+        cfg!(target_feature = "power8") ||
+            $crate::detect::check_for($crate::detect::Feature::power8)
+    };
+    ($t:tt) => { compile_error!(concat!("unknown powerpc64 target feature: ", $t)) };
+}
+
+
+/// PowerPC64 CPU Feature enum. Each variant denotes a position in a bitset
+/// for a particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// Altivec
+    altivec,
+    /// VSX
+    vsx,
+    /// Power8
+    power8,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/x86.rs b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
new file mode 100644
index 00000000000..3ef8d31d12b
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
@@ -0,0 +1,331 @@
+//! This module implements minimal run-time feature detection for x86.
+//!
+//! The features are detected using the `detect_features` function below.
+//! This function uses the CPUID instruction to read the feature flags from the
+//! CPU and encodes them in a `usize` where each bit position represents
+//! whether a feature is available (bit is set) or unavailable (bit is cleared).
+//!
+//! The enum `Feature` is used to map bit positions to feature names, and
+//! the `is_x86_feature_detected!` macro is used to map string literals (e.g.
+//! "avx") to these bit positions (e.g. `Feature::avx`).
+//!
+//!
+//! The run-time feature detection is performed by the
+//! `__crate::detect::check_for(Feature) -> bool` function. On its first call,
+//! this function queries the CPU for the available features and stores them
+//! in a global `AtomicUsize` variable. The query is performed by just checking
+//! whether the feature bit in this global variable is set or cleared.
+
+/// A macro to test at *runtime* whether a CPU feature is available on
+/// x86/x86-64 platforms.
+///
+/// This macro is provided in the standard library and will detect at runtime
+/// whether the specified CPU feature is detected. This does *not* resolve at
+/// compile time unless the specified feature is already enabled for the entire
+/// crate. Runtime detection currently relies mostly on the `cpuid` instruction.
+///
+/// This macro only takes one argument which is a string literal of the feature
+/// being tested for. The feature names supported are the lowercase versions of
+/// the ones defined by Intel in [their documentation][docs].
+///
+/// ## Supported arguments
+///
+/// This macro supports the same names that `#[target_feature]` supports. Unlike
+/// `#[target_feature]`, however, this macro does not support names separated
+/// with a comma. Instead testing for multiple features must be done through
+/// separate macro invocations for now.
+///
+/// Supported arguments are:
+///
+/// * `"aes"`
+/// * `"pclmulqdq"`
+/// * `"rdrand"`
+/// * `"rdseed"`
+/// * `"tsc"`
+/// * `"mmx"`
+/// * `"sse"`
+/// * `"sse2"`
+/// * `"sse3"`
+/// * `"ssse3"`
+/// * `"sse4.1"`
+/// * `"sse4.2"`
+/// * `"sse4a"`
+/// * `"sha"`
+/// * `"avx"`
+/// * `"avx2"`
+/// * `"avx512f"`
+/// * `"avx512cd"`
+/// * `"avx512er"`
+/// * `"avx512pf"`
+/// * `"avx512bw"`
+/// * `"avx512dq"`
+/// * `"avx512vl"`
+/// * `"avx512ifma"`
+/// * `"avx512vbmi"`
+/// * `"avx512vpopcntdq"`
+/// * `"fma"`
+/// * `"bmi1"`
+/// * `"bmi2"`
+/// * `"abm"`
+/// * `"lzcnt"`
+/// * `"tbm"`
+/// * `"popcnt"`
+/// * `"fxsr"`
+/// * `"xsave"`
+/// * `"xsaveopt"`
+/// * `"xsaves"`
+/// * `"xsavec"`
+///
+/// [docs]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
+#[macro_export]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow_internal_unstable]
+macro_rules! is_x86_feature_detected {
+    ("aes") => {
+        cfg!(target_feature = "aes") || $crate::detect::check_for(
+            $crate::detect::Feature::aes)  };
+    ("pclmulqdq") => {
+        cfg!(target_feature = "pclmulqdq") || $crate::detect::check_for(
+            $crate::detect::Feature::pclmulqdq)  };
+    ("rdrand") => {
+        cfg!(target_feature = "rdrand") || $crate::detect::check_for(
+            $crate::detect::Feature::rdrand)  };
+    ("rdseed") => {
+        cfg!(target_feature = "rdseed") || $crate::detect::check_for(
+            $crate::detect::Feature::rdseed)  };
+    ("tsc") => {
+        cfg!(target_feature = "tsc") || $crate::detect::check_for(
+            $crate::detect::Feature::tsc)  };
+    ("mmx") => {
+        cfg!(target_feature = "mmx") || $crate::detect::check_for(
+            $crate::detect::Feature::mmx)  };
+    ("sse") => {
+        cfg!(target_feature = "sse") || $crate::detect::check_for(
+            $crate::detect::Feature::sse)  };
+    ("sse2") => {
+        cfg!(target_feature = "sse2") || $crate::detect::check_for(
+            $crate::detect::Feature::sse2)
+    };
+    ("sse3") => {
+        cfg!(target_feature = "sse3") || $crate::detect::check_for(
+            $crate::detect::Feature::sse3)
+    };
+    ("ssse3") => {
+        cfg!(target_feature = "ssse3") || $crate::detect::check_for(
+            $crate::detect::Feature::ssse3)
+    };
+    ("sse4.1") => {
+        cfg!(target_feature = "sse4.1") || $crate::detect::check_for(
+            $crate::detect::Feature::sse4_1)
+    };
+    ("sse4.2") => {
+        cfg!(target_feature = "sse4.2") || $crate::detect::check_for(
+            $crate::detect::Feature::sse4_2)
+    };
+    ("sse4a") => {
+        cfg!(target_feature = "sse4a") || $crate::detect::check_for(
+            $crate::detect::Feature::sse4a)
+    };
+    ("sha") => {
+        cfg!(target_feature = "sha") || $crate::detect::check_for(
+            $crate::detect::Feature::sha)
+    };
+    ("avx") => {
+        cfg!(target_feature = "avx") || $crate::detect::check_for(
+            $crate::detect::Feature::avx)
+    };
+    ("avx2") => {
+        cfg!(target_feature = "avx2") || $crate::detect::check_for(
+            $crate::detect::Feature::avx2)
+    };
+    ("avx512f") => {
+        cfg!(target_feature = "avx512f") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512f)
+    };
+    ("avx512cd") => {
+        cfg!(target_feature = "avx512cd") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512cd)
+    };
+    ("avx512er") => {
+        cfg!(target_feature = "avx512er") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512er)
+    };
+    ("avx512pf") => {
+        cfg!(target_feature = "avx512pf") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512pf)
+    };
+    ("avx512bw") => {
+        cfg!(target_feature = "avx512bw") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512bw)
+    };
+    ("avx512dq") => {
+        cfg!(target_feature = "avx512dq") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512dq)
+    };
+    ("avx512vl") => {
+        cfg!(target_feature = "avx512vl") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512vl)
+    };
+    ("avx512ifma") => {
+        cfg!(target_feature = "avx512ifma") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512_ifma)
+    };
+    ("avx512vbmi") => {
+        cfg!(target_feature = "avx512vbmi") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512_vbmi)
+    };
+    ("avx512vpopcntdq") => {
+        cfg!(target_feature = "avx512vpopcntdq") || $crate::detect::check_for(
+            $crate::detect::Feature::avx512_vpopcntdq)
+    };
+    ("fma") => {
+        cfg!(target_feature = "fma") || $crate::detect::check_for(
+            $crate::detect::Feature::fma)
+    };
+    ("bmi1") => {
+        cfg!(target_feature = "bmi1") || $crate::detect::check_for(
+            $crate::detect::Feature::bmi)
+    };
+    ("bmi2") => {
+        cfg!(target_feature = "bmi2") || $crate::detect::check_for(
+            $crate::detect::Feature::bmi2)
+    };
+    ("abm") => {
+        cfg!(target_feature = "abm") || $crate::detect::check_for(
+            $crate::detect::Feature::abm)
+    };
+    ("lzcnt") => {
+        cfg!(target_feature = "lzcnt") || $crate::detect::check_for(
+            $crate::detect::Feature::abm)
+    };
+    ("tbm") => {
+        cfg!(target_feature = "tbm") || $crate::detect::check_for(
+            $crate::detect::Feature::tbm)
+    };
+    ("popcnt") => {
+        cfg!(target_feature = "popcnt") || $crate::detect::check_for(
+            $crate::detect::Feature::popcnt)
+    };
+    ("fxsr") => {
+        cfg!(target_feature = "fxsr") || $crate::detect::check_for(
+            $crate::detect::Feature::fxsr)
+    };
+    ("xsave") => {
+        cfg!(target_feature = "xsave") || $crate::detect::check_for(
+            $crate::detect::Feature::xsave)
+    };
+    ("xsaveopt") => {
+        cfg!(target_feature = "xsaveopt") || $crate::detect::check_for(
+            $crate::detect::Feature::xsaveopt)
+    };
+    ("xsaves") => {
+        cfg!(target_feature = "xsaves") || $crate::detect::check_for(
+            $crate::detect::Feature::xsaves)
+    };
+    ("xsavec") => {
+        cfg!(target_feature = "xsavec") || $crate::detect::check_for(
+            $crate::detect::Feature::xsavec)
+    };
+    ("cmpxchg16b") => {
+        cfg!(target_feature = "cmpxchg16b") || $crate::detect::check_for(
+            $crate::detect::Feature::cmpxchg16b)
+    };
+    ("adx") => {
+        cfg!(target_feature = "adx") || $crate::detect::check_for(
+            $crate::detect::Feature::adx)
+    };
+    ($t:tt) => {
+        compile_error!(concat!("unknown target feature: ", $t))
+    };
+}
+
+/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a
+/// particular feature.
+///
+/// This is an unstable implementation detail subject to change.
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+#[doc(hidden)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub enum Feature {
+    /// AES (Advanced Encryption Standard New Instructions AES-NI)
+    aes,
+    /// CLMUL (Carry-less Multiplication)
+    pclmulqdq,
+    /// RDRAND
+    rdrand,
+    /// RDSEED
+    rdseed,
+    /// TSC (Time Stamp Counter)
+    tsc,
+    /// MMX
+    mmx,
+    /// SSE (Streaming SIMD Extensions)
+    sse,
+    /// SSE2 (Streaming SIMD Extensions 2)
+    sse2,
+    /// SSE3 (Streaming SIMD Extensions 3)
+    sse3,
+    /// SSSE3 (Supplemental Streaming SIMD Extensions 3)
+    ssse3,
+    /// SSE4.1 (Streaming SIMD Extensions 4.1)
+    sse4_1,
+    /// SSE4.2 (Streaming SIMD Extensions 4.2)
+    sse4_2,
+    /// SSE4a (Streaming SIMD Extensions 4a)
+    sse4a,
+    /// SHA
+    sha,
+    /// AVX (Advanced Vector Extensions)
+    avx,
+    /// AVX2 (Advanced Vector Extensions 2)
+    avx2,
+    /// AVX-512 F (Foundation)
+    avx512f,
+    /// AVX-512 CD (Conflict Detection Instructions)
+    avx512cd,
+    /// AVX-512 ER (Exponential and Reciprocal Instructions)
+    avx512er,
+    /// AVX-512 PF (Prefetch Instructions)
+    avx512pf,
+    /// AVX-512 BW (Byte and Word Instructions)
+    avx512bw,
+    /// AVX-512 DQ (Doubleword and Quadword)
+    avx512dq,
+    /// AVX-512 VL (Vector Length Extensions)
+    avx512vl,
+    /// AVX-512 IFMA (Integer Fused Multiply Add)
+    avx512_ifma,
+    /// AVX-512 VBMI (Vector Byte Manipulation Instructions)
+    avx512_vbmi,
+    /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and
+    /// Quadword)
+    avx512_vpopcntdq,
+    /// FMA (Fused Multiply Add)
+    fma,
+    /// BMI1 (Bit Manipulation Instructions 1)
+    bmi,
+    /// BMI2 (Bit Manipulation Instructions 2)
+    bmi2,
+    /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero
+    /// Count) on Intel
+    abm,
+    /// TBM (Trailing Bit Manipulation)
+    tbm,
+    /// POPCNT (Population Count)
+    popcnt,
+    /// FXSR (Floating-point context fast save and restore)
+    fxsr,
+    /// XSAVE (Save Processor Extended States)
+    xsave,
+    /// XSAVEOPT (Save Processor Extended States Optimized)
+    xsaveopt,
+    /// XSAVES (Save Processor Extended States Supervisor)
+    xsaves,
+    /// XSAVEC (Save Processor Extended States Compacted)
+    xsavec,
+    /// CMPXCHG16B, a 16-byte compare-and-swap instruction
+    cmpxchg16b,
+    /// ADX, Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+    adx,
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/bit.rs b/library/stdarch/crates/std_detect/src/detect/bit.rs
new file mode 100644
index 00000000000..578f0b16b74
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/bit.rs
@@ -0,0 +1,9 @@
+//! Bit manipulation utilities.
+
+/// Returns `true` if bit `bit` of `x` is set (debug builds require `bit < 32`).
+#[allow(dead_code)]
+#[inline]
+pub(crate) fn test(x: usize, bit: u32) -> bool {
+    debug_assert!(bit < 32, "bit index out-of-bounds");
+    x & (1 << bit) != 0
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/cache.rs b/library/stdarch/crates/std_detect/src/detect/cache.rs
new file mode 100644
index 00000000000..c2de4da7349
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/cache.rs
@@ -0,0 +1,162 @@
+//! Caches run-time feature detection so that it only needs to be computed
+//! once.
+
+#![allow(dead_code)] // not used on all platforms
+
+use core::sync::atomic::Ordering;
+
+#[cfg(target_pointer_width = "64")]
+use core::sync::atomic::AtomicU64;
+
+#[cfg(target_pointer_width = "32")]
+use core::sync::atomic::AtomicU32;
+
+/// Returns a copy of `x` with bit number `bit` set.
+#[inline]
+const fn set_bit(x: u64, bit: u32) -> u64 {
+    x | 1 << bit
+}
+
+/// Returns `true` if bit number `bit` of `x` is set.
+#[inline]
+const fn test_bit(x: u64, bit: u32) -> bool {
+    x & (1 << bit) != 0
+}
+
+/// Maximum number of features that can be cached (valid bit indices are `0..63`).
+const CACHE_CAPACITY: u32 = 63;
+
+/// Bitset of detected features that is used to initialize the cache.
+#[derive(Copy, Clone)]
+pub(crate) struct Initializer(u64);
+
+impl Default for Initializer {
+    fn default() -> Self {
+        Initializer(0)
+    }
+}
+
+impl Initializer {
+    /// Tests the `bit` of the cache.
+    #[allow(dead_code)]
+    #[inline]
+    pub(crate) fn test(self, bit: u32) -> bool {
+        // FIXME: this way of making sure that the cache is large enough is
+        // brittle.
+        debug_assert!(
+            bit < CACHE_CAPACITY,
+            "too many features, time to increase the cache size!"
+        );
+        test_bit(self.0, bit)
+    }
+
+    /// Sets the `bit` of the cache.
+    #[inline]
+    pub(crate) fn set(&mut self, bit: u32) {
+        // FIXME: this way of making sure that the cache is large enough is
+        // brittle.
+        debug_assert!(
+            bit < CACHE_CAPACITY,
+            "too many features, time to increase the cache size!"
+        );
+        let v = self.0;
+        self.0 = set_bit(v, bit);
+    }
+}
+
+/// This global variable is a cache of the features supported by the CPU.
+static CACHE: Cache = Cache::uninitialized();
+
+/// Feature cache with capacity for `CACHE_CAPACITY` features.
+///
+/// Note: the all-ones value (`u64::max_value()`) represents an uninitialized
+/// cache, which is why the last bit is not available for features.
+#[cfg(target_pointer_width = "64")]
+struct Cache(AtomicU64);
+
+#[cfg(target_pointer_width = "64")]
+impl Cache {
+    /// Creates an uninitialized cache.
+    const fn uninitialized() -> Self {
+        const X: AtomicU64 = AtomicU64::new(u64::max_value());
+        Self(X)
+    }
+    /// Is the cache uninitialized?
+    #[inline]
+    pub(crate) fn is_uninitialized(&self) -> bool {
+        self.0.load(Ordering::Relaxed) == u64::max_value()
+    }
+
+    /// Is the `bit` in the global `CACHE` set?
+    #[inline]
+    pub(crate) fn test(&self, bit: u32) -> bool {
+        test_bit(CACHE.0.load(Ordering::Relaxed), bit)
+    }
+
+    /// Initializes the cache.
+    #[inline]
+    pub(crate) fn initialize(&self, value: Initializer) {
+        self.0.store(value.0, Ordering::Relaxed);
+    }
+}
+
+/// Feature cache with capacity for `CACHE_CAPACITY` features.
+///
+/// Note: the last feature bit is used to represent an uninitialized cache;
+/// the high word is published last (`Release`) and checked with `Acquire`.
+#[cfg(target_pointer_width = "32")]
+struct Cache(AtomicU32, AtomicU32);
+
+#[cfg(target_pointer_width = "32")]
+impl Cache {
+    /// Creates an uninitialized cache.
+    const fn uninitialized() -> Self {
+        Cache(
+            AtomicU32::new(u32::max_value()),
+            AtomicU32::new(u32::max_value()),
+        )
+    }
+    /// Is the cache uninitialized? (`Acquire` pairs with `initialize`.)
+    #[inline]
+    pub(crate) fn is_uninitialized(&self) -> bool {
+        self.1.load(Ordering::Acquire) == u32::max_value()
+    }
+
+    /// Is the `bit` in the cache set?
+    #[inline]
+    pub(crate) fn test(&self, bit: u32) -> bool {
+        if bit < 32 {
+            test_bit(CACHE.0.load(Ordering::Relaxed) as u64, bit)
+        } else {
+            test_bit(CACHE.1.load(Ordering::Relaxed) as u64, bit - 32)
+        }
+    }
+
+    /// Initializes the cache: low word first, then the high word (`Release`).
+    #[inline]
+    pub(crate) fn initialize(&self, value: Initializer) {
+        let lo: u32 = value.0 as u32;
+        let hi: u32 = (value.0 >> 32) as u32;
+        self.0.store(lo, Ordering::Relaxed);
+        self.1.store(hi, Ordering::Release);
+    }
+}
+
+/// Test the `bit` of the storage. If the storage has not been initialized,
+/// initializes it with the result of `f()`.
+///
+/// On its first invocation, it detects the CPU features and caches them in the
+/// `CACHE` global variable (an `AtomicU64`, or two `AtomicU32` on 32-bit targets).
+///
+/// It uses the `Feature` variant to index into this variable as a bitset. If
+/// the bit is set, the feature is enabled, and otherwise it is disabled.
+#[inline]
+pub(crate) fn test<F>(bit: u32, f: F) -> bool
+where
+    F: FnOnce() -> Initializer,
+{
+    if CACHE.is_uninitialized() {
+        CACHE.initialize(f());
+    }
+    CACHE.test(bit)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/error_macros.rs b/library/stdarch/crates/std_detect/src/detect/error_macros.rs
new file mode 100644
index 00000000000..6769757ed93
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/error_macros.rs
@@ -0,0 +1,150 @@
+//! The `is_{target_arch}_feature_detected!` macros are only available on their
+//! architecture. These macros provide better error messages when the user
+//! attempts to call them on a different architecture.
+
+/// Fallback `is_x86_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `x86` and `x86_64`.
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_x86_feature_detected {
+    ($t: tt) => {
+        compile_error!(
+            r#"
+        is_x86_feature_detected can only be used on x86 and x86_64 targets.
+        You can prevent it from being used in other architectures by
+        guarding it behind a cfg(target_arch) as follows:
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                if is_x86_feature_detected(...) { ... }
+            }
+        "#
+        )
+    };
+}
+
+/// Fallback `is_arm_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `arm`.
+#[cfg(not(target_arch = "arm"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_arm_feature_detected {
+    ($t:tt) => {
+        compile_error!(
+            r#"
+        is_arm_feature_detected can only be used on ARM targets.
+        You can prevent it from being used in other architectures by
+        guarding it behind a cfg(target_arch) as follows:
+
+            #[cfg(target_arch = "arm")] {
+                if is_arm_feature_detected(...) { ... }
+            }
+        "#
+        )
+    };
+}
+
+/// Fallback `is_aarch64_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `aarch64`.
+#[cfg(not(target_arch = "aarch64"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_aarch64_feature_detected {
+    ($t: tt) => {
+        compile_error!(
+            r#"
+        is_aarch64_feature_detected can only be used on AArch64 targets.
+        You can prevent it from being used in other architectures by
+        guarding it behind a cfg(target_arch) as follows:
+
+            #[cfg(target_arch = "aarch64")] {
+                if is_aarch64_feature_detected(...) { ... }
+            }
+        "#
+        )
+    };
+}
+
+/// Fallback `is_powerpc_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `powerpc`.
+#[cfg(not(target_arch = "powerpc"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_powerpc_feature_detected {
+    ($t:tt) => {
+        compile_error!(
+            r#"
+is_powerpc_feature_detected can only be used on PowerPC targets.
+You can prevent it from being used in other architectures by
+guarding it behind a cfg(target_arch) as follows:
+
+    #[cfg(target_arch = "powerpc")] {
+        if is_powerpc_feature_detected(...) { ... }
+    }
+"#
+        )
+    };
+}
+
+/// Fallback `is_powerpc64_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `powerpc64`.
+#[cfg(not(target_arch = "powerpc64"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_powerpc64_feature_detected {
+    ($t:tt) => {
+        compile_error!(
+            r#"
+is_powerpc64_feature_detected can only be used on PowerPC64 targets.
+You can prevent it from being used in other architectures by
+guarding it behind a cfg(target_arch) as follows:
+
+    #[cfg(target_arch = "powerpc64")] {
+        if is_powerpc64_feature_detected(...) { ... }
+    }
+"#
+        )
+    };
+}
+
+/// Fallback `is_mips_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `mips`.
+#[cfg(not(target_arch = "mips"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_mips_feature_detected {
+    ($t:tt) => {
+        compile_error!(
+            r#"
+        is_mips_feature_detected can only be used on MIPS targets.
+        You can prevent it from being used in other architectures by
+        guarding it behind a cfg(target_arch) as follows:
+
+            #[cfg(target_arch = "mips")] {
+                if is_mips_feature_detected(...) { ... }
+            }
+        "#
+        )
+    };
+}
+
+/// Fallback `is_mips64_feature_detected!` that fails compilation with an
+/// explanatory error on targets other than `mips64`.
+#[cfg(not(target_arch = "mips64"))]
+#[macro_export]
+#[unstable(feature = "stdsimd", issue = "27731")]
+macro_rules! is_mips64_feature_detected {
+    ($t:tt) => {
+        compile_error!(
+            r#"
+        is_mips64_feature_detected can only be used on MIPS64 targets.
+        You can prevent it from being used in other architectures by
+        guarding it behind a cfg(target_arch) as follows:
+
+            #[cfg(target_arch = "mips64")] {
+                if is_mips64_feature_detected(...) { ... }
+            }
+        "#
+        )
+    };
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/mod.rs b/library/stdarch/crates/std_detect/src/detect/mod.rs
new file mode 100644
index 00000000000..f446e88eedc
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/mod.rs
@@ -0,0 +1,85 @@
+//! This module implements run-time feature detection.
+//!
+//! The `is_{arch}_feature_detected!("feature-name")` macros take the name of a
+//! feature as a string-literal, and return a boolean indicating whether the
+//! feature is enabled at run-time or not.
+//!
+//! These macros do two things:
+//! * map the string-literal into an integer stored as a `Feature` enum,
+//! * call a `os::check_for(x: Feature)` function that returns `true` if the
+//! feature is enabled.
+//!
+//! The `Feature` enums are also implemented in the `arch/{target_arch}.rs`
+//! modules.
+//!
+//! The `check_for` functions are, in general, Operating System dependent. Most
+//! architectures do not allow user-space programs to query the feature bits
+//! due to security concerns (x86 is the big exception). These functions are
+//! implemented in the `os/{target_os}.rs` modules.
+
+#[macro_use]
+mod error_macros;
+
+cfg_if! {
+    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        #[path = "arch/x86.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "arm")] {
+        #[path = "arch/arm.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "aarch64")] {
+        #[path = "arch/aarch64.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "powerpc")] {
+        #[path = "arch/powerpc.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "powerpc64")] {
+        #[path = "arch/powerpc64.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "mips")] {
+        #[path = "arch/mips.rs"]
+        #[macro_use]
+        mod arch;
+    } else if #[cfg(target_arch = "mips64")] {
+        #[path = "arch/mips64.rs"]
+        #[macro_use]
+        mod arch;
+    } else {
+        // Unimplemented architecture: placeholder `Feature` for the re-export below.
+        mod arch {
+            pub enum Feature {
+                Null
+            }
+        }
+    }
+}
+pub use self::arch::Feature;
+
+mod bit;
+mod cache;
+
+cfg_if! {
+    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        // x86/x86_64 is matched first: no OS specific functionality is required.
+        #[path = "os/x86.rs"]
+        mod os;
+    } else if #[cfg(all(target_os = "linux", feature = "use_std"))] {
+        #[path = "os/linux/mod.rs"]
+        mod os;
+    } else if #[cfg(target_os = "freebsd")] {
+        #[cfg(target_arch = "aarch64")]
+        #[path = "os/aarch64.rs"]
+        mod aarch64;
+        #[path = "os/freebsd/mod.rs"]
+        mod os;
+    } else {
+        #[path = "os/other.rs"]
+        mod os;
+    }
+}
+pub use self::os::check_for;
diff --git a/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs
new file mode 100644
index 00000000000..f28d15a7c3e
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs
@@ -0,0 +1,79 @@
+//! Run-time feature detection for Aarch64 on any OS that emulates the mrs instruction.
+//!
+//! On FreeBSD >= 12.0, Linux >= 4.11 and other operating systems, it is possible to use
+//! privileged system registers from userspace to check CPU feature support.
+//!
+//! AArch64 system registers ID_AA64ISAR0_EL1, ID_AA64PFR0_EL1, ID_AA64ISAR1_EL1
+//! have bits dedicated to features like AdvSIMD, CRC32, AES, atomics (LSE), etc.
+//! Each part of the register indicates the level of support for a certain feature, e.g.
+//! when ID_AA64ISAR0_EL1[7:4] is >= 1, AES is supported; when it's >= 2, PMULL is supported.
+//!
+//! For proper support of [SoCs where different cores have different capabilities](https://medium.com/@jadr2ddude/a-big-little-problem-a-tale-of-big-little-gone-wrong-e7778ce744bb),
+//! the OS has to always report only the features supported by all cores, like [FreeBSD does](https://reviews.freebsd.org/D17137#393947).
+//!
+//! References:
+//!
+//! - [Zircon implementation](https://fuchsia.googlesource.com/zircon/+/master/kernel/arch/arm64/feature.cpp)
+//! - [Linux documentation](https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt)
+
+use crate::detect::{Feature, cache};
+
+/// Reads the `ID_AA64*_EL1` system registers and records the detected features.
+///
+/// This will cause SIGILL if the current OS is not trapping the `mrs` instruction.
+pub(crate) fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+
+    {
+        let mut enable_feature = |f, enable| {
+            if enable {
+                value.set(f as u32);
+            }
+        };
+
+        // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
+        let aa64isar0: u64;
+        unsafe { asm!("mrs $0, ID_AA64ISAR0_EL1" : "=r"(aa64isar0)); }
+
+        let aes = bits_shift(aa64isar0, 7, 4) >= 1;
+        let pmull = bits_shift(aa64isar0, 7, 4) >= 2;
+        let sha1 = bits_shift(aa64isar0, 11, 8) >= 1;
+        let sha2 = bits_shift(aa64isar0, 15, 12) >= 1;
+        enable_feature(Feature::pmull, pmull);
+        // Crypto is specified as AES + PMULL + SHA1 + SHA2 per LLVM/hosts.cpp
+        enable_feature(Feature::crypto, aes && pmull && sha1 && sha2);
+        enable_feature(Feature::lse, bits_shift(aa64isar0, 23, 20) >= 1);
+        enable_feature(Feature::crc, bits_shift(aa64isar0, 19, 16) >= 1);
+
+        // ID_AA64PFR0_EL1 - Processor Feature Register 0
+        let aa64pfr0: u64;
+        unsafe { asm!("mrs $0, ID_AA64PFR0_EL1" : "=r"(aa64pfr0)); }
+
+        let fp = bits_shift(aa64pfr0, 19, 16) < 0xF;
+        let fphp = bits_shift(aa64pfr0, 19, 16) >= 1;
+        let asimd = bits_shift(aa64pfr0, 23, 20) < 0xF;
+        let asimdhp = bits_shift(aa64pfr0, 23, 20) >= 1;
+        enable_feature(Feature::fp, fp);
+        enable_feature(Feature::fp16, fphp);
+        // asimd requires fp; if fp16 is reported it also requires asimdhp
+        // (NOTE(review): a field value of 0xF also satisfies `>= 1` - confirm):
+        enable_feature(Feature::asimd, fp && asimd && (!fphp | asimdhp));
+        // SIMD extensions require SIMD support:
+        enable_feature(Feature::rdm, asimd && bits_shift(aa64isar0, 31, 28) >= 1);
+        enable_feature(Feature::dotprod, asimd && bits_shift(aa64isar0, 47, 44) >= 1);
+        enable_feature(Feature::sve, asimd && bits_shift(aa64pfr0, 35, 32) >= 1);
+
+        // ID_AA64ISAR1_EL1 - Instruction Set Attribute Register 1
+        let aa64isar1: u64;
+        unsafe { asm!("mrs $0, ID_AA64ISAR1_EL1" : "=r"(aa64isar1)); }
+
+        enable_feature(Feature::rcpc, bits_shift(aa64isar1, 23, 20) >= 1);
+    }
+
+    value
+}
+
+#[inline]
+fn bits_shift(x: u64, high: usize, low: usize) -> u64 {
+    (x >> low) & ((1 << (high - low + 1)) - 1)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs
new file mode 100644
index 00000000000..910d2f33b39
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs
@@ -0,0 +1,28 @@
+//! Run-time feature detection for Aarch64 on FreeBSD.
+
+use crate::detect::{Feature, cache};
+use super::super::aarch64::detect_features;
+
+/// Returns whether feature `x` is detected, using the cached detection result.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features)
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn dump() {
+        println!("asimd: {:?}", is_aarch64_feature_detected!("asimd"));
+        println!("pmull: {:?}", is_aarch64_feature_detected!("pmull"));
+        println!("fp: {:?}", is_aarch64_feature_detected!("fp"));
+        println!("fp16: {:?}", is_aarch64_feature_detected!("fp16"));
+        println!("sve: {:?}", is_aarch64_feature_detected!("sve"));
+        println!("crc: {:?}", is_aarch64_feature_detected!("crc"));
+        println!("crypto: {:?}", is_aarch64_feature_detected!("crypto"));
+        println!("lse: {:?}", is_aarch64_feature_detected!("lse"));
+        println!("rdm: {:?}", is_aarch64_feature_detected!("rdm"));
+        println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc"));
+        println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod"));
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs
new file mode 100644
index 00000000000..1c73cefd47d
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs
@@ -0,0 +1,14 @@
+//! Run-time feature detection on FreeBSD
+
+cfg_if! {
+    if #[cfg(target_arch = "aarch64")] {
+        mod aarch64;
+        pub use self::aarch64::check_for;
+    } else {
+        use crate::detect::Feature;
+        /// No detection is implemented for this architecture: always `false`.
+        pub fn check_for(_x: Feature) -> bool {
+            false
+        }
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
new file mode 100644
index 00000000000..f7dc0f0222e
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
@@ -0,0 +1,157 @@
+//! Run-time feature detection for Aarch64 on Linux.
+
+use crate::detect::{Feature, cache, bit};
+use super::{auxvec, cpuinfo};
+
+/// Performs run-time feature detection by querying the feature cache for `x`.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features) // `detect_features` (below) is handed to the cache as its initializer callback
+}
+
+/// Tries to read the features from the auxiliary vector and, if that fails,
+/// from /proc/cpuinfo. If both fail, `cache::Initializer::default()` is returned.
+fn detect_features() -> cache::Initializer {
+    if let Ok(auxv) = auxvec::auxv() {
+        let hwcap: AtHwcap = auxv.into(); // decodes individual bits of `auxv.hwcap`
+        return hwcap.cache();
+    }
+    if let Ok(c) = cpuinfo::CpuInfo::new() {
+        let hwcap: AtHwcap = c.into(); // decodes the names listed in the "Features" field
+        return hwcap.cache();
+    }
+    cache::Initializer::default()
+}
+
+/// Decoded aarch64 HWCAP feature flags. The number after each field is the bit index tested for it in the platform-specific [asm/hwcap.h][hwcap]; commented-out entries are bits that are not decoded here.
+///
+/// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/hwcap.h
+struct AtHwcap {
+    fp: bool, // 0
+    asimd: bool, // 1
+    // evtstrm: bool, // 2
+    aes: bool, // 3
+    pmull: bool, // 4
+    sha1: bool, // 5
+    sha2: bool, // 6
+    crc32: bool, // 7
+    atomics: bool, // 8
+    fphp: bool, // 9
+    asimdhp: bool, // 10
+    // cpuid: bool, // 11
+    asimdrdm: bool, // 12
+    // jscvt: bool, // 13
+    // fcma: bool, // 14
+    lrcpc: bool, // 15
+    // dcpop: bool, // 16
+    // sha3: bool, // 17
+    // sm3: bool, // 18
+    // sm4: bool, // 19
+    asimddp: bool, // 20
+    // sha512: bool, // 21
+    sve: bool, // 22
+}
+
+impl From<auxvec::AuxVec> for AtHwcap {
+    /// Builds an `AtHwcap` by testing individual bits of `auxv.hwcap`; the bit indices match the numbers annotated on the struct fields.
+    fn from(auxv: auxvec::AuxVec) -> Self {
+        AtHwcap {
+            fp: bit::test(auxv.hwcap, 0),
+            asimd: bit::test(auxv.hwcap, 1),
+            // evtstrm: bit::test(auxv.hwcap, 2),
+            aes: bit::test(auxv.hwcap, 3),
+            pmull: bit::test(auxv.hwcap, 4),
+            sha1: bit::test(auxv.hwcap, 5),
+            sha2: bit::test(auxv.hwcap, 6),
+            crc32: bit::test(auxv.hwcap, 7),
+            atomics: bit::test(auxv.hwcap, 8),
+            fphp: bit::test(auxv.hwcap, 9),
+            asimdhp: bit::test(auxv.hwcap, 10),
+            // cpuid: bit::test(auxv.hwcap, 11),
+            asimdrdm: bit::test(auxv.hwcap, 12),
+            // jscvt: bit::test(auxv.hwcap, 13),
+            // fcma: bit::test(auxv.hwcap, 14),
+            lrcpc: bit::test(auxv.hwcap, 15),
+            // dcpop: bit::test(auxv.hwcap, 16),
+            // sha3: bit::test(auxv.hwcap, 17),
+            // sm3: bit::test(auxv.hwcap, 18),
+            // sm4: bit::test(auxv.hwcap, 19),
+            asimddp: bit::test(auxv.hwcap, 20),
+            // sha512: bit::test(auxv.hwcap, 21),
+            sve: bit::test(auxv.hwcap, 22),
+        }
+    }
+}
+
+impl From<cpuinfo::CpuInfo> for AtHwcap {
+    /// Builds an `AtHwcap` from /proc/cpuinfo: each flag is set when its name appears as a word in the "Features" field.
+    fn from(c: cpuinfo::CpuInfo) -> Self {
+        let f = &c.field("Features"); // looked up once; every flag below is a word test against this field
+        AtHwcap {
+            // 64-bit names. FIXME: In 32-bit compatibility mode /proc/cpuinfo will
+            // map some of the 64-bit names to some 32-bit feature names. This does not
+            // cover that yet.
+            fp: f.has("fp"),
+            asimd: f.has("asimd"),
+            // evtstrm: f.has("evtstrm"),
+            aes: f.has("aes"),
+            pmull: f.has("pmull"),
+            sha1: f.has("sha1"),
+            sha2: f.has("sha2"),
+            crc32: f.has("crc32"),
+            atomics: f.has("atomics"),
+            fphp: f.has("fphp"),
+            asimdhp: f.has("asimdhp"),
+            // cpuid: f.has("cpuid"),
+            asimdrdm: f.has("asimdrdm"),
+            // jscvt: f.has("jscvt"),
+            // fcma: f.has("fcma"),
+            lrcpc: f.has("lrcpc"),
+            // dcpop: f.has("dcpop"),
+            // sha3: f.has("sha3"),
+            // sm3: f.has("sm3"),
+            // sm4: f.has("sm4"),
+            asimddp: f.has("asimddp"),
+            // sha512: f.has("sha512"),
+            sve: f.has("sve"),
+        }
+    }
+}
+
+impl AtHwcap {
+    /// Initializes the cache from the feature bits.
+    ///
+    /// The features are enabled approximately like in LLVM host feature detection:
+    /// https://github.com/llvm-mirror/llvm/blob/master/lib/Support/Host.cpp#L1273
+    fn cache(self) -> cache::Initializer {
+        let mut value = cache::Initializer::default();
+        { // inner scope: the closure's mutable borrow of `value` ends here, before `value` is returned
+            let mut enable_feature = |f, enable| {
+                if enable {
+                    value.set(f as u32);
+                }
+            };
+
+            enable_feature(Feature::fp, self.fp);
+            // Half-float support requires float support
+            enable_feature(Feature::fp16, self.fp && self.fphp);
+            enable_feature(Feature::pmull, self.pmull);
+            enable_feature(Feature::crc, self.crc32);
+            enable_feature(Feature::lse, self.atomics);
+            enable_feature(Feature::rcpc, self.lrcpc);
+
+            // SIMD support requires float support - if half-floats are
+            // supported, it also requires half-float support:
+            let asimd = self.fp && self.asimd && (!self.fphp | self.asimdhp); // `|` on bools is a non-short-circuit OR: "fphp implies asimdhp"
+            enable_feature(Feature::asimd, asimd);
+            // SIMD extensions require SIMD support:
+            enable_feature(Feature::rdm, self.asimdrdm && asimd);
+            enable_feature(Feature::dotprod, self.asimddp && asimd);
+            enable_feature(Feature::sve, self.sve && asimd);
+
+            // Crypto is specified as AES + PMULL + SHA1 + SHA2 per LLVM/hosts.cpp
+            enable_feature(Feature::crypto, self.aes && self.pmull && self.sha1 && self.sha2);
+        }
+        value
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs
new file mode 100644
index 00000000000..0d58a847cd6
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs
@@ -0,0 +1,49 @@
+//! Run-time feature detection for ARM on Linux.
+
+use crate::detect::{Feature, cache, bit};
+use super::{auxvec, cpuinfo};
+
+/// Performs run-time feature detection by querying the feature cache for `x`.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features) // `detect_features` (below) is handed to the cache as its initializer callback
+}
+
+/// Tries to read the features from the auxiliary vector and, if that fails,
+/// from /proc/cpuinfo. If both fail, the default (unmodified) initializer is returned.
+fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+    let enable_feature = |value: &mut cache::Initializer, f, enable| {
+        if enable {
+            value.set(f as u32);
+        }
+    };
+
+    // The values are part of the platform-specific [asm/hwcap.h][hwcap] (the 32-bit ARM header, not the arm64 one)
+    //
+    // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h
+    if let Ok(auxv) = auxvec::auxv() {
+        enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12)); // bit 12 of AT_HWCAP
+        enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1)); // bit 1 of AT_HWCAP2
+        return value;
+    }
+
+    if let Ok(c) = cpuinfo::CpuInfo::new() {
+        enable_feature(&mut value, Feature::neon, c.field("Features").has("neon") &&
+            !has_broken_neon(&c)); // the broken-NEON exclusion is applied on this cpuinfo path only
+        enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull"));
+        return value;
+    }
+    value
+}
+
+/// Is the CPU known to have a broken NEON unit? True only when all five cpuinfo fields below match exactly.
+///
+/// See https://crbug.com/341598.
+fn has_broken_neon(cpuinfo: &cpuinfo::CpuInfo) -> bool {
+    cpuinfo.field("CPU implementer") == "0x51"
+        && cpuinfo.field("CPU architecture") == "7"
+        && cpuinfo.field("CPU variant") == "0x1"
+        && cpuinfo.field("CPU part") == "0x04d"
+        && cpuinfo.field("CPU revision") == "0"
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
new file mode 100644
index 00000000000..31c980fd382
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -0,0 +1,270 @@
+//! Parses ELF auxiliary vectors.
+#![cfg_attr(not(target_arch = "aarch64"), allow(dead_code))]
+
+extern crate std;
+use self::std::{prelude::v1::*, fs::File, io::Read};
+
+use core::mem;
+
+/// Auxiliary-vector key of the CPU hardware capabilities bitfield.
+pub(crate) const AT_HWCAP: usize = 16;
+/// Auxiliary-vector key of the second CPU hardware capabilities bitfield; only compiled for arm and powerpc64.
+#[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+pub(crate) const AT_HWCAP2: usize = 26;
+
+/// Cached HWCAP bitfields of the ELF Auxiliary Vector.
+///
+/// If an entry cannot be read, all the bits in the bitfield are set to zero.
+/// This should be interpreted as all the features being disabled.
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct AuxVec {
+    pub hwcap: usize, // value stored under the `AT_HWCAP` key
+    #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+    pub hwcap2: usize, // value stored under the `AT_HWCAP2` key; the field only exists on arm and powerpc64
+}
+
+/// ELF Auxiliary Vector
+///
+/// The auxiliary vector is a memory region in a running ELF program's stack
+/// composed of (key: usize, value: usize) pairs.
+///
+/// The keys used in the aux vector are platform dependent. For Linux, they are
+/// defined in [linux/auxvec.h][auxvec_h]. The hardware capabilities of a given
+/// CPU can be queried with the `AT_HWCAP` and `AT_HWCAP2` keys.
+///
+/// There is no perfect way of reading the auxiliary vector.
+///
+/// - If `getauxval` is dynamically linked to this binary and returns non-zero values, it will be used.
+/// - Otherwise, try to read `/proc/self/auxv`.
+/// - If that fails, this function returns an error.
+///
+/// Note that run-time feature detection is not invoked for features that can
+/// be detected at compile-time. Also note that if this function returns an
+/// error, cpuinfo still can (and will) be used to try to perform run-time
+/// feature detection on some platforms.
+///
+/// For more information about when `getauxval` is available check the great
+/// [`auxv` crate documentation][auxv_docs].
+///
+/// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h
+/// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/
+pub(crate) fn auxv() -> Result<AuxVec, ()> {
+    // Try to call a dynamically-linked getauxval function.
+    if let Ok(hwcap) = getauxval(AT_HWCAP) {
+        // Targets with only AT_HWCAP:
+        #[cfg(any(target_arch = "aarch64", target_arch = "mips",
+                  target_arch = "mips64"))]
+        {
+            if hwcap != 0 { // a zero hwcap is not accepted; control falls through to the /proc/self/auxv path
+                return Ok(AuxVec { hwcap });
+            }
+        }
+
+        // Targets with AT_HWCAP and AT_HWCAP2:
+        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        {
+            if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
+                if hwcap != 0 && hwcap2 != 0 { // both values must be non-zero, otherwise fall through to the file
+                    return Ok(AuxVec { hwcap, hwcap2 });
+                }
+            }
+        }
+        drop(hwcap); // presumably keeps `hwcap` "used" on targets where neither cfg block above is compiled — confirm
+    }
+    // If calling getauxval fails, try to read the auxiliary vector from
+    // its file:
+    auxv_from_file("/proc/self/auxv")
+}
+
+/// Tries to read the `key` from the auxiliary vector by calling the
+/// dynamically-linked `getauxval` function. If the function is not linked,
+/// this function returns `Err`.
+fn getauxval(key: usize) -> Result<usize, ()> {
+    use libc;
+    pub type F = unsafe extern "C" fn(usize) -> usize; // signature the looked-up symbol is called with
+    unsafe {
+        let ptr = libc::dlsym(
+            libc::RTLD_DEFAULT,
+            "getauxval\0".as_ptr() as *const _, // NUL-terminated symbol name passed as a C string
+        );
+        if ptr.is_null() {
+            return Err(()); // symbol not found: report failure so the caller can use another source
+        }
+
+        let ffi_getauxval: F = mem::transmute(ptr); // NOTE(review): sound only if the resolved symbol really has signature `F` — confirm
+        Ok(ffi_getauxval(key))
+    }
+}
+
+/// Tries to read the auxiliary vector from the `file`. If opening, reading or
+/// interpreting it fails, this function returns `Err`.
+fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
+    let mut file = File::open(file).map_err(|_| ())?;
+
+    // See https://github.com/torvalds/linux/blob/v3.19/include/uapi/linux/auxvec.h
+    //
+    // The auxiliary vector contains at most 32 (key,value) fields: from
+    // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of
+    // 2*32 `usize` elements is enough to read the whole vector.
+    let mut buf = [0_usize; 64];
+    {
+        let raw: &mut [u8; 64 * mem::size_of::<usize>()] =
+            unsafe { mem::transmute(&mut buf) }; // byte view over `buf`; both types have the same size in bytes
+        file.read(raw).map_err(|_| ())?; // a single `read` call; the byte count is discarded, so any unread tail of `buf` stays zero
+    }
+    auxv_from_buf(&buf)
+}
+
+/// Tries to interpret the `buffer` as an auxiliary vector of (key, value) pairs. If the
+/// required key(s) are not found, this function returns `Err`.
+fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> {
+    // Targets with only AT_HWCAP:
+    #[cfg(any(target_arch = "aarch64", target_arch = "mips",
+              target_arch = "mips64"))]
+    {
+        for el in buf.chunks(2) { // el[0] is the key, el[1] the value
+            match el[0] {
+                AT_HWCAP => return Ok(AuxVec { hwcap: el[1] }), // first AT_HWCAP entry wins
+                _ => (),
+            }
+        }
+    }
+    // Targets with AT_HWCAP and AT_HWCAP2:
+    #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+    {
+        let mut hwcap = None;
+        let mut hwcap2 = None;
+        for el in buf.chunks(2) { // scans every pair; a later entry with the same key overwrites an earlier one
+            match el[0] {
+                AT_HWCAP => hwcap = Some(el[1]),
+                AT_HWCAP2 => hwcap2 = Some(el[1]),
+                _ => (),
+            }
+        }
+
+        if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) { // both keys must be present, otherwise Err below
+            return Ok(AuxVec { hwcap, hwcap2 });
+        }
+    }
+    drop(buf); // presumably keeps `buf` "used" on targets where neither cfg block above is compiled — confirm
+    Err(())
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate auxv as auxv_crate;
+    use super::*;
+
+    // Reads the Auxiliary Vector key from /proc/self/auxv
+    // using the auxv crate. Returns `None` if the lookup fails.
+    fn auxv_crate_getprocfs(key: usize) -> Option<usize> {
+        use self::auxv_crate::AuxvType;
+        use self::auxv_crate::procfs::search_procfs_auxv;
+        let k = key as AuxvType;
+        match search_procfs_auxv(&[k]) {
+            Ok(v) => Some(v[&k] as usize),
+            Err(_) => None,
+        }
+    }
+
+    // Reads the Auxiliary Vector key from getauxval()
+    // using the auxv crate. Returns `None` if the lookup fails.
+    #[cfg(not(any(target_arch = "mips", target_arch = "mips64")))]
+    fn auxv_crate_getauxval(key: usize) -> Option<usize> {
+        use self::auxv_crate::AuxvType;
+        use self::auxv_crate::getauxval::Getauxval;
+        let q = auxv_crate::getauxval::NativeGetauxval {};
+        match q.getauxval(key as AuxvType) {
+            Ok(v) => Some(v as usize),
+            Err(_) => None,
+        }
+    }
+
+    // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
+    // does not always contain the AT_HWCAP key under qemu.
+    #[cfg(not(any(target_arch = "mips", target_arch = "mips64", target_arch = "powerpc")))]
+    #[test]
+    fn auxv_crate() { // cross-checks `auxv()` against the auxv crate's getauxval wrapper
+        let v = auxv();
+        if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) {
+            let rt_hwcap = v.expect("failed to find hwcap key").hwcap;
+            assert_eq!(rt_hwcap, hwcap);
+        }
+
+        // Targets with AT_HWCAP and AT_HWCAP2:
+        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        {
+            if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
+                let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
+                assert_eq!(rt_hwcap2, hwcap2);
+            }
+        }
+    }
+
+    #[test]
+    fn auxv_dump() { // smoke test: prints the result and asserts nothing
+        if let Ok(auxvec) = auxv() {
+            println!("{:?}", auxvec);
+        } else {
+            println!("both getauxval() and reading /proc/self/auxv failed!");
+        }
+    }
+
+    cfg_if! {
+        if #[cfg(target_arch = "arm")] {
+            #[test]
+            fn linux_rpi3() { // parses a recorded auxv file and checks both decoded values
+                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-rpi3.auxv");
+                println!("file: {}", file);
+                let v = auxv_from_file(file).unwrap();
+                assert_eq!(v.hwcap, 4174038);
+                assert_eq!(v.hwcap2, 16);
+            }
+
+            #[test]
+            #[should_panic]
+            fn linux_macos_vb() { // expected to panic in `unwrap()` because parsing this file must fail
+                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv");
+                println!("file: {}", file);
+                let v = auxv_from_file(file).unwrap();
+                // this file is incomplete (contains hwcap but not hwcap2), we
+                // want to fall back to /proc/cpuinfo in this case, so
+                // reading should fail. assert_eq!(v.hwcap, 126614527);
+                // assert_eq!(v.hwcap2, 0);
+            }
+        } else if #[cfg(target_arch = "aarch64")] {
+            #[test]
+            fn linux_x64() { // parses a recorded auxv file and checks the decoded hwcap value
+                let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-x64-i7-6850k.auxv");
+                println!("file: {}", file);
+                let v = auxv_from_file(file).unwrap();
+                assert_eq!(v.hwcap, 3219913727);
+            }
+        }
+    }
+
+    #[test]
+    fn auxv_dump_procfs() { // smoke test: prints the result and asserts nothing
+        if let Ok(auxvec) = auxv_from_file("/proc/self/auxv") {
+            println!("{:?}", auxvec);
+        } else {
+            println!("reading /proc/self/auxv failed!");
+        }
+    }
+
+    #[test]
+    fn auxv_crate_procfs() { // cross-checks `auxv()` against the auxv crate's procfs reader
+        let v = auxv();
+        if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
+            assert_eq!(v.unwrap().hwcap, hwcap);
+        }
+
+        // Targets with AT_HWCAP and AT_HWCAP2:
+        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        {
+            if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
+                assert_eq!(v.unwrap().hwcap2, hwcap2);
+            }
+        }
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs
new file mode 100644
index 00000000000..b3168578537
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs
@@ -0,0 +1,301 @@
+//! Parses /proc/cpuinfo
+#![cfg_attr(not(target_arch = "arm"), allow(dead_code))]
+
+extern crate std;
+use self::std::{prelude::v1::*, fs::File, io, io::Read};
+
+/// In-memory copy of the text of `/proc/cpuinfo`; fields are looked up line by line on demand.
+pub(crate) struct CpuInfo {
+    raw: String, // unparsed file contents
+}
+
+impl CpuInfo {
+    /// Reads /proc/cpuinfo into CpuInfo. Fails if the file cannot be opened or read as a string.
+    pub(crate) fn new() -> Result<Self, io::Error> {
+        let mut file = File::open("/proc/cpuinfo")?;
+        let mut cpui = Self { raw: String::new() };
+        file.read_to_string(&mut cpui.raw)?;
+        Ok(cpui)
+    }
+    /// Returns the value of the first line whose trimmed text starts with `field`; an empty field if there is none.
+    pub(crate) fn field(&self, field: &str) -> CpuInfoField {
+        for l in self.raw.lines() {
+            if l.trim().starts_with(field) { // NOTE(review): prefix match, so "model" would also match a "model name" line if that line came first
+                return CpuInfoField::new(l.split(": ").nth(1)); // text between the first and second ": "; `None` when the line has no ": "
+            }
+        }
+        CpuInfoField(None)
+    }
+
+    /// Returns the `raw` contents of `/proc/cpuinfo`
+    #[cfg(test)]
+    fn raw(&self) -> &String {
+        &self.raw
+    }
+
+    #[cfg(test)] // test-only constructor: wraps `other` verbatim instead of reading the file
+    fn from_str(other: &str) -> Result<Self, ::std::io::Error> {
+        Ok(Self {
+            raw: String::from(other),
+        })
+    }
+}
+
+/// Value of one cpuinfo field, borrowed from the raw text; `None` when the field has no value.
+#[derive(Debug)]
+pub(crate) struct CpuInfoField<'a>(Option<&'a str>);
+
+impl<'a> PartialEq<&'a str> for CpuInfoField<'a> {
+    fn eq(&self, other: &&'a str) -> bool { // compares the field value with `other`
+        match self.0 {
+            None => other.is_empty(), // a missing value equals only the empty string
+            Some(f) => f == other.trim(), // `other` is trimmed before the comparison
+        }
+    }
+}
+
+impl<'a> CpuInfoField<'a> {
+    pub(crate) fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> { // wraps `v`, trimming surrounding whitespace when present
+        match v {
+            None => CpuInfoField::<'b>(None),
+            Some(f) => CpuInfoField::<'b>(Some(f.trim())),
+        }
+    }
+    /// Does the field exist (i.e. does it hold a value)?
+    #[cfg(test)]
+    pub(crate) fn exists(&self) -> bool {
+        self.0.is_some()
+    }
+    /// Does the field contain `other` as one of its space-separated words?
+    pub(crate) fn has(&self, other: &str) -> bool {
+        match self.0 {
+            None => other.is_empty(), // a missing value "contains" only the empty string
+            Some(f) => {
+                let other = other.trim();
+                for v in f.split(' ') { // splits on single spaces only; each piece must equal `other` exactly
+                    if v == other {
+                        return true;
+                    }
+                }
+                false
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn raw_dump() { // reads the host's real /proc/cpuinfo; panics in `unwrap()` if it cannot be read
+        let cpuinfo = CpuInfo::new().unwrap();
+        if cpuinfo.field("vendor_id") == "GenuineIntel" { // the assertions below only run on hosts reporting this vendor
+            assert!(cpuinfo.field("flags").exists());
+            assert!(!cpuinfo.field("vendor33_id").exists());
+            assert!(cpuinfo.field("flags").has("sse"));
+            assert!(!cpuinfo.field("flags").has("avx314"));
+        }
+        println!("{}", cpuinfo.raw());
+    }
+
+    const CORE_DUO_T6500: &str = r"processor       : 0
+vendor_id       : GenuineIntel
+cpu family      : 6
+model           : 23
+model name      : Intel(R) Core(TM)2 Duo CPU     T6500  @ 2.10GHz
+stepping        : 10
+microcode       : 0xa0b
+cpu MHz         : 1600.000
+cache size      : 2048 KB
+physical id     : 0
+siblings        : 2
+core id         : 0
+cpu cores       : 2
+apicid          : 0
+initial apicid  : 0
+fdiv_bug        : no
+hlt_bug         : no
+f00f_bug        : no
+coma_bug        : no
+fpu             : yes
+fpu_exception   : yes
+cpuid level     : 13
+wp              : yes
+flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm
+bogomips        : 4190.43
+clflush size    : 64
+cache_alignment : 64
+address sizes   : 36 bits physical, 48 bits virtual
+power management:
+";
+
+    #[test]
+    fn core_duo_t6500() { // checks field lookup and word matching against the recorded CORE_DUO_T6500 text
+        let cpuinfo = CpuInfo::from_str(CORE_DUO_T6500).unwrap();
+        assert_eq!(cpuinfo.field("vendor_id"), "GenuineIntel");
+        assert_eq!(cpuinfo.field("cpu family"), "6");
+        assert_eq!(cpuinfo.field("model"), "23");
+        assert_eq!(
+            cpuinfo.field("model name"),
+            "Intel(R) Core(TM)2 Duo CPU     T6500  @ 2.10GHz"
+        );
+        assert_eq!(
+            cpuinfo.field("flags"),
+            "fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm"
+        );
+        assert!(cpuinfo.field("flags").has("fpu")); // first word of the field
+        assert!(cpuinfo.field("flags").has("dtherm")); // last word of the field
+        assert!(cpuinfo.field("flags").has("sse2"));
+        assert!(!cpuinfo.field("flags").has("avx"));
+    }
+
+    const ARM_CORTEX_A53: &str =
+        r"Processor   : AArch64 Processor rev 3 (aarch64)
+        processor   : 0
+        processor   : 1
+        processor   : 2
+        processor   : 3
+        processor   : 4
+        processor   : 5
+        processor   : 6
+        processor   : 7
+        Features    : fp asimd evtstrm aes pmull sha1 sha2 crc32
+        CPU implementer : 0x41
+        CPU architecture: AArch64
+        CPU variant : 0x0
+        CPU part    : 0xd03
+        CPU revision    : 3
+
+        Hardware    : HiKey Development Board
+        ";
+
+    #[test]
+    fn arm_cortex_a53() { // the ARM_CORTEX_A53 text is indented, so this also exercises trimming of each line
+        let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A53).unwrap();
+        assert_eq!(
+            cpuinfo.field("Processor"),
+            "AArch64 Processor rev 3 (aarch64)"
+        );
+        assert_eq!(
+            cpuinfo.field("Features"),
+            "fp asimd evtstrm aes pmull sha1 sha2 crc32"
+        );
+        assert!(cpuinfo.field("Features").has("pmull"));
+        assert!(!cpuinfo.field("Features").has("neon")); // "neon" is not listed in this fixture's Features line
+        assert!(cpuinfo.field("Features").has("asimd"));
+    }
+
+    const ARM_CORTEX_A57: &str = r"Processor	: Cortex A57 Processor rev 1 (aarch64)
+processor	: 0
+processor	: 1
+processor	: 2
+processor	: 3
+Features	: fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt
+CPU implementer	: 0x41
+CPU architecture: 8
+CPU variant	: 0x1
+CPU part	: 0xd07
+CPU revision	: 1";
+
+    #[test]
+    fn arm_cortex_a57() { // checks field lookup and word matching against the recorded ARM_CORTEX_A57 text
+        let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A57).unwrap();
+        assert_eq!(
+            cpuinfo.field("Processor"),
+            "Cortex A57 Processor rev 1 (aarch64)"
+        );
+        assert_eq!(
+            cpuinfo.field("Features"),
+            "fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt"
+        );
+        assert!(cpuinfo.field("Features").has("pmull"));
+        assert!(cpuinfo.field("Features").has("neon")); // this fixture lists both "neon" and "asimd"
+        assert!(cpuinfo.field("Features").has("asimd"));
+    }
+
+    const POWER8E_POWERKVM: &str = r"processor       : 0
+cpu             : POWER8E (raw), altivec supported
+clock           : 3425.000000MHz
+revision        : 2.1 (pvr 004b 0201)
+
+processor       : 1
+cpu             : POWER8E (raw), altivec supported
+clock           : 3425.000000MHz
+revision        : 2.1 (pvr 004b 0201)
+
+processor       : 2
+cpu             : POWER8E (raw), altivec supported
+clock           : 3425.000000MHz
+revision        : 2.1 (pvr 004b 0201)
+
+processor       : 3
+cpu             : POWER8E (raw), altivec supported
+clock           : 3425.000000MHz
+revision        : 2.1 (pvr 004b 0201)
+
+timebase        : 512000000
+platform        : pSeries
+model           : IBM pSeries (emulated by qemu)
+machine         : CHRP IBM pSeries (emulated by qemu)";
+
+    #[test]
+    fn power8_powerkvm() { // the `cpu` field of this fixture mentions "altivec"
+        let cpuinfo = CpuInfo::from_str(POWER8E_POWERKVM).unwrap();
+        assert_eq!(cpuinfo.field("cpu"), "POWER8E (raw), altivec supported");
+
+        assert!(cpuinfo.field("cpu").has("altivec"));
+    }
+
+    const POWER5P: &str = r"processor       : 0
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 1
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 2
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 3
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 4
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 5
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 6
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+processor       : 7
+cpu             : POWER5+ (gs)
+clock           : 1900.098000MHz
+revision        : 2.1 (pvr 003b 0201)
+
+timebase        : 237331000
+platform        : pSeries
+machine         : CHRP IBM,9133-55A";
+
+    #[test]
+    fn power5p() { // the `cpu` field of this fixture does not mention "altivec"
+        let cpuinfo = CpuInfo::from_str(POWER5P).unwrap();
+        assert_eq!(cpuinfo.field("cpu"), "POWER5+ (gs)");
+
+        assert!(!cpuinfo.field("cpu").has("altivec"));
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs
new file mode 100644
index 00000000000..7c180326feb
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs
@@ -0,0 +1,31 @@
+//! Run-time feature detection for MIPS on Linux.
+
+use crate::detect::{Feature, cache, bit};
+use super::auxvec;
+
+/// Performs run-time feature detection by querying the feature cache for `x`.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features) // `detect_features` (below) is handed to the cache as its initializer callback
+}
+
+/// Tries to read the features from the auxiliary vector. There is no
+/// /proc/cpuinfo fallback yet: if that fails, no feature is enabled.
+fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+    let enable_feature = |value: &mut cache::Initializer, f, enable| {
+        if enable {
+            value.set(f as u32);
+        }
+    };
+
+    // The values are part of the platform-specific [asm/hwcap.h][hwcap] (the MIPS header)
+    //
+    // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/mips/include/uapi/asm/hwcap.h
+    if let Ok(auxv) = auxvec::auxv() {
+        enable_feature(&mut value, Feature::msa, bit::test(auxv.hwcap, 1)); // bit 1 of AT_HWCAP
+        return value;
+    }
+    // TODO: fall back via cpuinfo
+    value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
new file mode 100644
index 00000000000..642dfb46571
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
@@ -0,0 +1,26 @@
+//! Run-time feature detection on Linux
+
+mod auxvec;
+mod cpuinfo;
+
+cfg_if! {
+    if #[cfg(target_arch = "aarch64")] { // each branch re-exports the `check_for` of the matching architecture module
+        mod aarch64;
+        pub use self::aarch64::check_for;
+    } else if #[cfg(target_arch = "arm")] {
+        mod arm;
+        pub use self::arm::check_for;
+    } else  if #[cfg(any(target_arch = "mips", target_arch = "mips64"))] {
+        mod mips;
+        pub use self::mips::check_for;
+    } else if #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] {
+        mod powerpc;
+        pub use self::powerpc::check_for;
+    } else {
+        use crate::detect::Feature;
+        /// Performs run-time feature detection: always reports `false` on architectures without a Linux detector above.
+        pub fn check_for(_x: Feature) -> bool {
+            false
+        }
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
new file mode 100644
index 00000000000..0022a7db983
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
@@ -0,0 +1,41 @@
+//! Run-time feature detection for PowerPC on Linux.
+
+use crate::detect::{Feature, cache};
+use super::{auxvec, cpuinfo};
+
+/// Performs run-time feature detection by querying the feature cache for `x`.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features) // `detect_features` (below) is handed to the cache as its initializer callback
+}
+
+/// Tries to read the features from the auxiliary vector and, if that fails,
+/// from /proc/cpuinfo (which can only report `altivec`). If both fail, no feature is enabled.
+fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+    let enable_feature = |value: &mut cache::Initializer, f, enable| {
+        if enable {
+            value.set(f as u32);
+        }
+    };
+
+    // The values are part of the platform-specific [asm/cputable.h][cputable]
+    //
+    // [cputable]: https://github.com/torvalds/linux/blob/master/arch/powerpc/include/uapi/asm/cputable.h
+    if let Ok(auxv) = auxvec::auxv() {
+        // note: the PowerPC values are the mask to do the test (instead of the
+        // index of the bit to test like in ARM and Aarch64)
+        enable_feature(&mut value, Feature::altivec, auxv.hwcap & 0x10000000 != 0);
+        enable_feature(&mut value, Feature::vsx, auxv.hwcap & 0x00000080 != 0);
+        enable_feature(&mut value, Feature::power8, auxv.hwcap & 0x80000000 != 0); // NOTE(review): in cputable.h the ISA 2.07 (POWER8) mask 0x80000000 appears to belong to AT_HWCAP2, not AT_HWCAP — confirm
+        return value;
+    }
+
+    // PowerPC's /proc/cpuinfo lacks a proper Feature field,
+    // but `altivec` support is indicated in the `cpu` field.
+    if let Ok(c) = cpuinfo::CpuInfo::new() {
+        enable_feature(&mut value, Feature::altivec, c.field("cpu").has("altivec"));
+        return value;
+    }
+    value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/other.rs b/library/stdarch/crates/std_detect/src/detect/os/other.rs
new file mode 100644
index 00000000000..23e399ea790
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/other.rs
@@ -0,0 +1,9 @@
+//! Other operating systems
+
+use crate::detect::Feature;
+
+/// Performs run-time feature detection: always reports `false`, whatever feature is asked for.
+#[inline]
+pub fn check_for(_x: Feature) -> bool {
+    false
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
new file mode 100644
index 00000000000..9237d5dc0a5
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@@ -0,0 +1,357 @@
+//! x86 run-time feature detection is OS independent.
+
+use core::{prelude::v1::*, mem};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::detect::{Feature, cache, bit};
+
+/// Performs run-time feature detection.
+///
+/// The feature is converted to its `u32` discriminant and handed to
+/// `cache::test` together with `detect_features`, the CPUID-based routine
+/// defined below that produces the `cache::Initializer`.
+#[inline]
+pub fn check_for(x: Feature) -> bool {
+    cache::test(x as u32, detect_features)
+}
+
+/// Run-time feature detection on x86 works by using the CPUID instruction.
+///
+/// Returns a `cache::Initializer` in which the bit of every detected
+/// `Feature` has been set. If the CPU lacks `cpuid`, or only exposes leaf 0,
+/// the default (empty) initializer is returned.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] contains
+/// all the information about which flags to set to query which values, and in
+/// which registers these are reported.
+///
+/// The definitive references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+///   Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+///   System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
+fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+
+    // If the x86 CPU does not support the CPUID instruction then it is too
+    // old to support any of the currently-detectable features.
+    if !has_cpuid() {
+        return value;
+    }
+
+    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
+    // has `cpuid` support.
+
+    // 0. EAX = 0: Basic Information:
+    // - EAX returns the "Highest Function Parameter", that is, the maximum
+    //   leaf value for subsequent calls of `cpuid` in range [0,
+    //   0x8000_0000].
+    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX,
+    //   and ECX (in that order):
+    let (max_basic_leaf, vendor_id) = unsafe {
+        let CpuidResult {
+            eax: max_basic_leaf,
+            ebx,
+            ecx,
+            edx,
+        } = __cpuid(0);
+        let vendor_id: [[u8; 4]; 3] = [
+            mem::transmute(ebx),
+            mem::transmute(edx),
+            mem::transmute(ecx),
+        ];
+        let vendor_id: [u8; 12] = mem::transmute(vendor_id);
+        (max_basic_leaf, vendor_id)
+    };
+
+    if max_basic_leaf < 1 {
+        // Earlier Intel 486, CPUID not implemented
+        return value;
+    }
+
+    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
+    // Contains information about most x86 features.
+    let CpuidResult {
+        ecx: proc_info_ecx,
+        edx: proc_info_edx,
+        ..
+    } = unsafe { __cpuid(0x0000_0001_u32) };
+
+    // EAX = 7, ECX = 0: Queries "Extended Features";
+    // Contains information about bmi,bmi2, and avx2 support.
+    let (extended_features_ebx, extended_features_ecx) = if max_basic_leaf >= 7
+    {
+        let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+        (ebx, ecx)
+    } else {
+        (0, 0) // CPUID does not support "Extended Features"
+    };
+
+    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
+    // - EAX returns the max leaf value for extended information, that is,
+    // `cpuid` calls in range [0x8000_0000; u32::MAX]:
+    let CpuidResult {
+        eax: extended_max_basic_leaf,
+        ..
+    } = unsafe { __cpuid(0x8000_0000_u32) };
+
+    // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature
+    // Bits"
+    //
+    // `extended_max_basic_leaf` is an absolute leaf number in the range
+    // [0x8000_0000; u32::MAX], so this leaf may only be queried when the
+    // reported maximum is at least 0x8000_0001 (a comparison against `1`
+    // would succeed for any non-zero value).
+    let extended_proc_info_ecx = if extended_max_basic_leaf >= 0x8000_0001 {
+        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
+        ecx
+    } else {
+        0
+    };
+
+    {
+        // borrows value till the end of this scope:
+        // `r` is the register value, `rb` the bit to test in it, and `f` the
+        // feature to set when that bit is set.
+        let mut enable = |r, rb, f| {
+            if bit::test(r as usize, rb) {
+                value.set(f as u32);
+            }
+        };
+
+        enable(proc_info_ecx, 0, Feature::sse3);
+        enable(proc_info_ecx, 9, Feature::ssse3);
+        enable(proc_info_ecx, 13, Feature::cmpxchg16b);
+        enable(proc_info_ecx, 19, Feature::sse4_1);
+        enable(proc_info_ecx, 20, Feature::sse4_2);
+        enable(proc_info_ecx, 23, Feature::popcnt);
+        enable(proc_info_ecx, 25, Feature::aes);
+        enable(proc_info_ecx, 1, Feature::pclmulqdq);
+        enable(proc_info_ecx, 30, Feature::rdrand);
+        enable(extended_features_ebx, 18, Feature::rdseed);
+        enable(extended_features_ebx, 19, Feature::adx);
+        enable(proc_info_edx, 4, Feature::tsc);
+        enable(proc_info_edx, 23, Feature::mmx);
+        enable(proc_info_edx, 24, Feature::fxsr);
+        enable(proc_info_edx, 25, Feature::sse);
+        enable(proc_info_edx, 26, Feature::sse2);
+        enable(extended_features_ebx, 29, Feature::sha);
+
+        enable(extended_features_ebx, 3, Feature::bmi);
+        enable(extended_features_ebx, 8, Feature::bmi2);
+
+        // `XSAVE` and `AVX` support:
+        let cpu_xsave = bit::test(proc_info_ecx as usize, 26);
+        if cpu_xsave {
+            // 0. Here the CPU supports `XSAVE`.
+
+            // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+            // supports saving the state of the AVX/AVX2 vector registers on
+            // context-switches, see:
+            //
+            // - [intel: is avx enabled?][is_avx_enabled],
+            // - [mozilla: sse.cpp][mozilla_sse_cpp].
+            //
+            // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+            // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+            let cpu_osxsave = bit::test(proc_info_ecx as usize, 27);
+
+            if cpu_osxsave {
+                // 2. The OS must have signaled the CPU that it supports saving and
+                // restoring the:
+                //
+                // * SSE -> `XCR0.SSE[1]`
+                // * AVX -> `XCR0.AVX[2]`
+                // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+                //
+                // by setting the corresponding bits of `XCR0` to `1`.
+                //
+                // This is safe because the CPU supports `xsave`
+                // and the OS has set `osxsave`.
+                let xcr0 = unsafe { _xgetbv(0) };
+                // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+                let os_avx_support = xcr0 & 6 == 6;
+                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
+                let os_avx512_support = xcr0 & 224 == 224;
+
+                // Only if the OS and the CPU support saving/restoring the AVX
+                // registers we enable `xsave` support:
+                if os_avx_support {
+                    // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                    // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                    // Developer’s Manual, Volume 1: Basic Architecture":
+                    //
+                    // "Software enables the XSAVE feature set by setting
+                    // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                    // instruction). If this bit is 0, execution of any of XGETBV,
+                    // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                    // causes an invalid-opcode exception (#UD)"
+                    //
+                    enable(proc_info_ecx, 26, Feature::xsave);
+
+                    // For `xsaveopt`, `xsavec`, and `xsaves` we need to query:
+                    // Processor Extended State Enumeration Sub-leaf (EAX = 0DH,
+                    // ECX = 1):
+                    if max_basic_leaf >= 0xd {
+                        let CpuidResult {
+                            eax: proc_extended_state1_eax,
+                            ..
+                        } = unsafe { __cpuid_count(0xd_u32, 1) };
+                        enable(proc_extended_state1_eax, 0, Feature::xsaveopt);
+                        enable(proc_extended_state1_eax, 1, Feature::xsavec);
+                        enable(proc_extended_state1_eax, 3, Feature::xsaves);
+                    }
+
+                    // FMA (uses 256-bit wide registers):
+                    enable(proc_info_ecx, 12, Feature::fma);
+
+                    // And AVX/AVX2:
+                    enable(proc_info_ecx, 28, Feature::avx);
+                    enable(extended_features_ebx, 5, Feature::avx2);
+
+                    // For AVX-512 the OS also needs to support saving/restoring
+                    // the extended state, only then we enable AVX-512 support:
+                    if os_avx512_support {
+                        enable(extended_features_ebx, 16, Feature::avx512f);
+                        enable(extended_features_ebx, 17, Feature::avx512dq);
+                        enable(extended_features_ebx, 21, Feature::avx512_ifma);
+                        enable(extended_features_ebx, 26, Feature::avx512pf);
+                        enable(extended_features_ebx, 27, Feature::avx512er);
+                        enable(extended_features_ebx, 28, Feature::avx512cd);
+                        enable(extended_features_ebx, 30, Feature::avx512bw);
+                        enable(extended_features_ebx, 31, Feature::avx512vl);
+                        enable(extended_features_ecx, 1, Feature::avx512_vbmi);
+                        enable(
+                            extended_features_ecx,
+                            14,
+                            Feature::avx512_vpopcntdq,
+                        );
+                    }
+                }
+            }
+        }
+
+        // This detects ABM on AMD CPUs and LZCNT on Intel CPUs.
+        // On intel CPUs with popcnt, lzcnt implements the
+        // "missing part" of ABM, so we map both to the same
+        // internal feature.
+        //
+        // The `is_x86_feature_detected!("lzcnt")` macro then
+        // internally maps to Feature::abm.
+        enable(extended_proc_info_ecx, 5, Feature::abm);
+        if vendor_id == *b"AuthenticAMD" {
+            // These features are only available on AMD CPUs:
+            enable(extended_proc_info_ecx, 6, Feature::sse4a);
+            enable(extended_proc_info_ecx, 21, Feature::tbm);
+        }
+    }
+
+    value
+}
+
+// Tests that run `is_x86_feature_detected!` against the CPU executing the
+// test binary.
+#[cfg(test)]
+mod tests {
+    // Reference implementation that `compare_with_cupid` checks against.
+    extern crate cupid;
+
+    /// Prints the detection result of each feature listed below; nothing is
+    /// asserted here.
+    //
+    // NOTE(review): the labels are not uniform - the `avx512*` lines lack the
+    // trailing colon and the `bmi` label prints the `bmi1` feature.
+    #[test]
+    fn dump() {
+        println!("aes: {:?}", is_x86_feature_detected!("aes"));
+        println!("pclmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq"));
+        println!("rdrand: {:?}", is_x86_feature_detected!("rdrand"));
+        println!("rdseed: {:?}", is_x86_feature_detected!("rdseed"));
+        println!("tsc: {:?}", is_x86_feature_detected!("tsc"));
+        println!("sse: {:?}", is_x86_feature_detected!("sse"));
+        println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
+        println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
+        println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
+        println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+        println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
+        println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
+        println!("sha: {:?}", is_x86_feature_detected!("sha"));
+        println!("avx: {:?}", is_x86_feature_detected!("avx"));
+        println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
+        println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
+        println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
+        println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
+        println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
+        println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
+        println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
+        println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
+        println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
+        println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
+        println!(
+            "avx512_vpopcntdq {:?}",
+            is_x86_feature_detected!("avx512vpopcntdq")
+        );
+        println!("fma: {:?}", is_x86_feature_detected!("fma"));
+        println!("abm: {:?}", is_x86_feature_detected!("abm"));
+        println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
+        println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
+        println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
+        println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
+        println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
+        println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
+        println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
+        println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+        println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+        println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
+        println!("cmpxchg16b: {:?}", is_x86_feature_detected!("cmpxchg16b"));
+        println!("adx: {:?}", is_x86_feature_detected!("adx"));
+    }
+
+    /// Asserts that, for each feature listed below, the detection macro and
+    /// `cupid` report the same answer on the host CPU.
+    #[test]
+    fn compare_with_cupid() {
+        // Panics (failing the test) if `cupid::master()` yields `None`.
+        let information = cupid::master().unwrap();
+        assert_eq!(is_x86_feature_detected!("aes"), information.aesni());
+        assert_eq!(is_x86_feature_detected!("pclmulqdq"), information.pclmulqdq());
+        assert_eq!(is_x86_feature_detected!("rdrand"), information.rdrand());
+        assert_eq!(is_x86_feature_detected!("rdseed"), information.rdseed());
+        assert_eq!(is_x86_feature_detected!("tsc"), information.tsc());
+        assert_eq!(is_x86_feature_detected!("sse"), information.sse());
+        assert_eq!(is_x86_feature_detected!("sse2"), information.sse2());
+        assert_eq!(is_x86_feature_detected!("sse3"), information.sse3());
+        assert_eq!(is_x86_feature_detected!("ssse3"), information.ssse3());
+        assert_eq!(is_x86_feature_detected!("sse4.1"), information.sse4_1());
+        assert_eq!(is_x86_feature_detected!("sse4.2"), information.sse4_2());
+        assert_eq!(is_x86_feature_detected!("sse4a"), information.sse4a());
+        assert_eq!(is_x86_feature_detected!("sha"), information.sha());
+        assert_eq!(is_x86_feature_detected!("avx"), information.avx());
+        assert_eq!(is_x86_feature_detected!("avx2"), information.avx2());
+        assert_eq!(is_x86_feature_detected!("avx512f"), information.avx512f());
+        assert_eq!(is_x86_feature_detected!("avx512cd"), information.avx512cd());
+        assert_eq!(is_x86_feature_detected!("avx512er"), information.avx512er());
+        assert_eq!(is_x86_feature_detected!("avx512pf"), information.avx512pf());
+        assert_eq!(is_x86_feature_detected!("avx512bw"), information.avx512bw());
+        assert_eq!(is_x86_feature_detected!("avx512dq"), information.avx512dq());
+        assert_eq!(is_x86_feature_detected!("avx512vl"), information.avx512vl());
+        assert_eq!(
+            is_x86_feature_detected!("avx512ifma"),
+            information.avx512_ifma()
+        );
+        assert_eq!(
+            is_x86_feature_detected!("avx512vbmi"),
+            information.avx512_vbmi()
+        );
+        assert_eq!(
+            is_x86_feature_detected!("avx512vpopcntdq"),
+            information.avx512_vpopcntdq()
+        );
+        assert_eq!(is_x86_feature_detected!("fma"), information.fma());
+        assert_eq!(is_x86_feature_detected!("bmi1"), information.bmi1());
+        assert_eq!(is_x86_feature_detected!("bmi2"), information.bmi2());
+        assert_eq!(is_x86_feature_detected!("popcnt"), information.popcnt());
+        // `abm` and `lzcnt` are both compared against cupid's `lzcnt` flag.
+        assert_eq!(is_x86_feature_detected!("abm"), information.lzcnt());
+        assert_eq!(is_x86_feature_detected!("tbm"), information.tbm());
+        assert_eq!(is_x86_feature_detected!("lzcnt"), information.lzcnt());
+        assert_eq!(is_x86_feature_detected!("xsave"), information.xsave());
+        assert_eq!(is_x86_feature_detected!("xsaveopt"), information.xsaveopt());
+        assert_eq!(
+            is_x86_feature_detected!("xsavec"),
+            information.xsavec_and_xrstor()
+        );
+        assert_eq!(
+            is_x86_feature_detected!("xsaves"),
+            information.xsaves_xrstors_and_ia32_xss()
+        );
+        assert_eq!(
+            is_x86_feature_detected!("cmpxchg16b"),
+            information.cmpxchg16b(),
+        );
+        assert_eq!(
+            is_x86_feature_detected!("adx"),
+            information.adx(),
+        );
+    }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv
new file mode 100644
index 00000000000..0538e661f63
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv
Binary files differdiff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
new file mode 100644
index 00000000000..6afe1b3b46a
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
Binary files differdiff --git a/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv
new file mode 100644
index 00000000000..75abc02d178
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv
Binary files differdiff --git a/library/stdarch/crates/std_detect/src/lib.rs b/library/stdarch/crates/std_detect/src/lib.rs
new file mode 100644
index 00000000000..af7fc3bdc42
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/lib.rs
@@ -0,0 +1,37 @@
+//! Run-time feature detection for the Rust standard library.
+//!
+//! To detect whether a feature is enabled in the system running the binary,
+//! use the appropriate macro for the target:
+//!
+//! * `x86` and `x86_64`: [`is_x86_feature_detected`]
+//! * `arm`: [`is_arm_feature_detected`]
+//! * `aarch64`: [`is_aarch64_feature_detected`]
+//! * `mips`: [`is_mips_feature_detected`]
+//! * `mips64`: [`is_mips64_feature_detected`]
+//! * `powerpc`: [`is_powerpc_feature_detected`]
+//! * `powerpc64`: [`is_powerpc64_feature_detected`]
+
+#![unstable(feature = "stdsimd", issue = "27731")]
+#![feature(const_fn, integer_atomics, staged_api, stdsimd)]
+#![feature(doc_cfg, allow_internal_unstable)]
+#![cfg_attr(feature = "cargo-clippy", allow(clippy::shadow_reuse))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    deny(clippy::missing_inline_in_public_items,)
+)]
+#![cfg_attr(target_os = "linux", feature(linkage))]
+#![cfg_attr(all(target_os = "freebsd", target_arch = "aarch64"), feature(asm))]
+#![no_std]
+
+#[cfg(test)]
+#[macro_use(println)]
+extern crate std;
+
+extern crate libc;
+
+#[macro_use]
+extern crate cfg_if;
+
+// The module is public but hidden from the rendered documentation and gated
+// behind the unstable `stdsimd` feature.
+#[doc(hidden)]
+#[unstable(feature = "stdsimd", issue = "27731")]
+pub mod detect;
diff --git a/library/stdarch/crates/std_detect/src/mod.rs b/library/stdarch/crates/std_detect/src/mod.rs
new file mode 100644
index 00000000000..b630e7ff383
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/mod.rs
@@ -0,0 +1,5 @@
+//! `std_detect`
+
+#[doc(hidden)] // unstable implementation detail
+#[unstable(feature = "stdsimd", issue = "27731")]
+pub mod detect;
diff --git a/library/stdarch/crates/stdsimd/tests/cpu-detection.rs b/library/stdarch/crates/std_detect/tests/cpu-detection.rs
index 636d530ef9a..85beeee63ac 100644
--- a/library/stdarch/crates/stdsimd/tests/cpu-detection.rs
+++ b/library/stdarch/crates/std_detect/tests/cpu-detection.rs
@@ -14,7 +14,7 @@
     target_arch = "powerpc64"
 ))]
 #[macro_use]
-extern crate stdsimd;
+extern crate std_detect;
 
 #[test]
 #[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android")))]
diff --git a/library/stdarch/crates/stdsimd-verify/build.rs b/library/stdarch/crates/stdsimd-verify/build.rs
index 314de67e40b..c0dc81b6a61 100644
--- a/library/stdarch/crates/stdsimd-verify/build.rs
+++ b/library/stdarch/crates/stdsimd-verify/build.rs
@@ -3,14 +3,16 @@ use std::path::Path;
 fn main() {
     let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
     let root = dir.parent().unwrap();
-    walk(&root.join("../coresimd/x86"));
-    walk(&root.join("../coresimd/x86_64"));
-    walk(&root.join("../coresimd/arm"));
-    walk(&root.join("../coresimd/aarch64"));
+    eprintln!("root: {}", root.display());
+    walk(&root.join("core_arch/src/x86"));
+    walk(&root.join("core_arch/src/x86_64"));
+    walk(&root.join("core_arch/src/arm"));
+    walk(&root.join("core_arch/src/aarch64"));
 }
 
 fn walk(root: &Path) {
     for file in root.read_dir().unwrap() {
+        eprintln!("root: {}", root.display());
         let file = file.unwrap();
         if file.file_type().unwrap().is_dir() {
             walk(&file.path());
diff --git a/library/stdarch/crates/stdsimd-verify/src/lib.rs b/library/stdarch/crates/stdsimd-verify/src/lib.rs
index 751614bbf84..45cb5f5dbcb 100644
--- a/library/stdarch/crates/stdsimd-verify/src/lib.rs
+++ b/library/stdarch/crates/stdsimd-verify/src/lib.rs
@@ -13,12 +13,12 @@ use proc_macro::TokenStream;
 
 #[proc_macro]
 pub fn x86_functions(input: TokenStream) -> TokenStream {
-    functions(input, &["../coresimd/x86", "../coresimd/x86_64"])
+    functions(input, &["core_arch/src/x86", "core_arch/src/x86_64"])
 }
 
 #[proc_macro]
 pub fn arm_functions(input: TokenStream) -> TokenStream {
-    functions(input, &["../coresimd/arm", "../coresimd/aarch64"])
+    functions(input, &["core_arch/src/arm", "core_arch/src/aarch64"])
 }
 
 fn functions(input: TokenStream, dirs: &[&str]) -> TokenStream {
diff --git a/library/stdarch/crates/stdsimd/src/lib.rs b/library/stdarch/crates/stdsimd/src/lib.rs
deleted file mode 100644
index 594adccf05e..00000000000
--- a/library/stdarch/crates/stdsimd/src/lib.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-//! SIMD and vendor intrinsics support library.
-//!
-//! This crate defines the vendor intrinsics and types primarily used for SIMD
-//! in Rust.
-
-#![feature(const_fn, integer_atomics, staged_api, stdsimd)]
-#![feature(doc_cfg, allow_internal_unstable)]
-#![cfg_attr(feature = "cargo-clippy", allow(clippy::shadow_reuse))]
-#![cfg_attr(
-    feature = "cargo-clippy",
-    deny(clippy::missing_inline_in_public_items,)
-)]
-#![cfg_attr(target_os = "linux", feature(linkage))]
-#![cfg_attr(all(target_os = "freebsd", target_arch = "aarch64"), feature(asm))]
-#![no_std]
-#![unstable(feature = "stdsimd", issue = "27731")]
-
-#[macro_use]
-extern crate cfg_if;
-extern crate coresimd;
-extern crate libc;
-extern crate std as __do_not_use_this_import;
-
-#[cfg(test)]
-#[allow(unused_imports)]
-#[macro_use(println, print)]
-extern crate std;
-
-#[path = "../../../stdsimd/mod.rs"]
-mod stdsimd;
-
-pub use stdsimd::*;
-
-#[allow(unused_imports)]
-use __do_not_use_this_import::fs;
-#[allow(unused_imports)]
-use __do_not_use_this_import::io;
-#[allow(unused_imports)]
-use __do_not_use_this_import::mem;
-#[allow(unused_imports)]
-use __do_not_use_this_import::prelude;