Implement more SIMD

author: Antoni Boucher <bouanto@zoho.com> 2022-05-14 17:36:37 -0400
committer: Antoni Boucher <bouanto@zoho.com> 2022-06-06 22:08:07 -0400
commit: 3b3594044327ea6a426e0b95dd3ffee725089430 (patch)
tree: 1861b76af4fcd78afd957b2bf65fe4fdf69739c7
parent: e8dca3e87d164d2806098c462c6ce41301341f68 (diff)
download: rust-3b3594044327ea6a426e0b95dd3ffee725089430.tar.gz
rust-3b3594044327ea6a426e0b95dd3ffee725089430.zip
4 files changed, 111 insertions, 136 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 211d19a8dc8..0e41bec8b76 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,10 +22,10 @@ default = ["master"]
 master = ["gccjit/master"]
 
 [dependencies]
-gccjit = { git = "https://github.com/antoyo/gccjit.rs" }
+#gccjit = { git = "https://github.com/antoyo/gccjit.rs" }
 
 # Local copy.
-#gccjit = { path = "../gccjit.rs" }
+gccjit = { path = "../gccjit.rs" }
 
 target-lexicon = "0.10.0"
 
diff --git a/src/builder.rs b/src/builder.rs
index fa490fe3f22..e7adf29fed8 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -1409,7 +1409,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
     }
 
     #[cfg(not(feature="master"))]
-    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    pub fn vector_reduce<F>(&mut self, _src: RValue<'gcc>, _op: F) -> RValue<'gcc>
     where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
     {
         unimplemented!();
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 1b089f08f76..6b78157410b 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -75,38 +75,38 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
                 "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
                     | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
                     | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
-                        let mut new_args = args.to_vec();
-                        let arg5_type = gcc_func.get_param_type(4);
-                        let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
-                        new_args.push(minus_one);
-                        args = new_args.into();
-                    },
-                    "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
-                        let mut new_args = args.to_vec();
+                    let mut new_args = args.to_vec();
+                    let arg5_type = gcc_func.get_param_type(4);
+                    let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                    new_args.push(minus_one);
+                    args = new_args.into();
+                },
+                "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+                    let mut new_args = args.to_vec();
 
-                        let mut last_arg = None;
-                        if args.len() == 4 {
-                            last_arg = new_args.pop();
-                        }
+                    let mut last_arg = None;
+                    if args.len() == 4 {
+                        last_arg = new_args.pop();
+                    }
 
-                        let arg4_type = gcc_func.get_param_type(3);
-                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
-                        new_args.push(minus_one);
+                    let arg4_type = gcc_func.get_param_type(3);
+                    let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                    new_args.push(minus_one);
 
-                        if args.len() == 3 {
-                            // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to
-                            // the same GCC intrinsic, but the former has 3 parameters and the
-                            // latter has 4 so it doesn't require this additional argument.
-                            let arg5_type = gcc_func.get_param_type(4);
-                            new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
-                        }
+                    if args.len() == 3 {
+                        // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 map to
+                        // the same GCC intrinsic, but the former has 3 parameters and the
+                        // latter has 4 so it doesn't require this additional argument.
+                        let arg5_type = gcc_func.get_param_type(4);
+                        new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
+                    }
 
-                        if let Some(last_arg) = last_arg {
-                            new_args.push(last_arg);
-                        }
+                    if let Some(last_arg) = last_arg {
+                        new_args.push(last_arg);
+                    }
 
-                        args = new_args.into();
-                    },
+                    args = new_args.into();
+                },
                     "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
                         | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
                         | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
@@ -131,6 +131,18 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
                         new_args.push(last_arg);
                         args = new_args.into();
                     },
+                    "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => {
+                        let mut new_args = args.to_vec();
+                        let last_arg = new_args.pop().expect("last arg");
+                        let arg2_type = gcc_func.get_param_type(1);
+                        let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue();
+                        new_args.push(undefined);
+                        let arg3_type = gcc_func.get_param_type(2);
+                        let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1);
+                        new_args.push(minus_one);
+                        new_args.push(last_arg);
+                        args = new_args.into();
+                    },
                     _ => (),
         }
     }
@@ -149,7 +161,8 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
             | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
             | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
             | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
-            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask"
+            | "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => {
                 if index == args_len - 1 {
                     return true;
                 }
@@ -221,6 +234,48 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
         "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask",
         "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask",
         "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
+        "llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask",
+        "llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask",
+        "llvm.x86.avx512.mask.cvttps2dq.256" => "__builtin_ia32_cvttps2dq256_mask",
+        "llvm.x86.avx512.mask.cvttps2dq.128" => "__builtin_ia32_cvttps2dq128_mask",
+        "llvm.x86.avx512.mask.cvttpd2dq.256" => "__builtin_ia32_cvttpd2dq256_mask",
+        "llvm.x86.avx512.mask.compress.d.512" => "__builtin_ia32_compresssi512_mask",
+        "llvm.x86.avx512.mask.compress.d.256" => "__builtin_ia32_compresssi256_mask",
+        "llvm.x86.avx512.mask.compress.d.128" => "__builtin_ia32_compresssi128_mask",
+        "llvm.x86.avx512.mask.compress.q.512" => "__builtin_ia32_compressdi512_mask",
+        "llvm.x86.avx512.mask.compress.q.256" => "__builtin_ia32_compressdi256_mask",
+        "llvm.x86.avx512.mask.compress.q.128" => "__builtin_ia32_compressdi128_mask",
+        "llvm.x86.avx512.mask.compress.ps.512" => "__builtin_ia32_compresssf512_mask",
+        "llvm.x86.avx512.mask.compress.ps.256" => "__builtin_ia32_compresssf256_mask",
+        "llvm.x86.avx512.mask.compress.ps.128" => "__builtin_ia32_compresssf128_mask",
+        "llvm.x86.avx512.mask.compress.pd.512" => "__builtin_ia32_compressdf512_mask",
+        "llvm.x86.avx512.mask.compress.pd.256" => "__builtin_ia32_compressdf256_mask",
+        "llvm.x86.avx512.mask.compress.pd.128" => "__builtin_ia32_compressdf128_mask",
+        "llvm.x86.avx512.mask.compress.store.d.512" => "",
+        "llvm.x86.avx512.mask.compress.store.d.256" => "",
+        "llvm.x86.avx512.mask.compress.store.d.128" => "",
+        "llvm.x86.avx512.mask.compress.store.q.512" => "",
+        "llvm.x86.avx512.mask.compress.store.q.256" => "",
+        "llvm.x86.avx512.mask.compress.store.q.128" => "",
+        "llvm.x86.avx512.mask.compress.store.ps.512" => "",
+        "llvm.x86.avx512.mask.compress.store.ps.256" => "",
+        "llvm.x86.avx512.mask.compress.store.ps.128" => "",
+        "llvm.x86.avx512.mask.compress.store.pd.512" => "",
+        "llvm.x86.avx512.mask.compress.store.pd.256" => "",
+        "llvm.x86.avx512.mask.compress.store.pd.128" => "",
+        "llvm.x86.avx512.mask.expand.d.512" => "",
+        "llvm.x86.avx512.mask.expand.d.256" => "",
+        "llvm.x86.avx512.mask.expand.d.128" => "",
+        "llvm.x86.avx512.mask.expand.q.512" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
+        "" => "",
 
         // The above doc points to unknown builtins for the following, so override them:
         "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index 870e9f776a4..a6cf99c62ff 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -1,5 +1,3 @@
-use std::cmp::Ordering;
-
 use gccjit::{BinaryOp, RValue, Type, ToRValue};
 use rustc_codegen_ssa::base::compare_simd_types;
 use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error};
@@ -309,117 +307,37 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
 
         enum Style {
             Float,
-            Int(/* is signed? */ bool),
+            Int,
             Unsupported,
         }
 
-        let (in_style, in_width) = match in_elem.kind() {
-            // vectors of pointer-sized integers should've been
-            // disallowed before here, so this unwrap is safe.
-            ty::Int(i) => (
-                Style::Int(true),
-                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Uint(u) => (
-                Style::Int(false),
-                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Float(f) => (Style::Float, f.bit_width()),
-            _ => (Style::Unsupported, 0),
-        };
-        let (out_style, out_width) = match out_elem.kind() {
-            ty::Int(i) => (
-                Style::Int(true),
-                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Uint(u) => (
-                Style::Int(false),
-                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Float(f) => (Style::Float, f.bit_width()),
-            _ => (Style::Unsupported, 0),
-        };
-
-        let extend = |in_type, out_type| {
-            let vector_type = bx.context.new_vector_type(out_type, 8);
-            let vector = args[0].immediate();
-            let array_type = bx.context.new_array_type(None, in_type, 8);
-            // TODO(antoyo): switch to using new_vector_access or __builtin_convertvector for vector casting.
-            let array = bx.context.new_bitcast(None, vector, array_type);
-
-            let cast_vec_element = |index| {
-                let index = bx.context.new_rvalue_from_int(bx.int_type, index);
-                bx.context.new_cast(None, bx.context.new_array_access(None, array, index).to_rvalue(), out_type)
+        let in_style =
+            match in_elem.kind() {
+                ty::Int(_) | ty::Uint(_) => Style::Int,
+                ty::Float(_) => Style::Float,
+                 _ => Style::Unsupported,
             };
 
-            bx.context.new_rvalue_from_vector(None, vector_type, &[
-                cast_vec_element(0),
-                cast_vec_element(1),
-                cast_vec_element(2),
-                cast_vec_element(3),
-                cast_vec_element(4),
-                cast_vec_element(5),
-                cast_vec_element(6),
-                cast_vec_element(7),
-            ])
-        };
+        let out_style =
+            match out_elem.kind() {
+                ty::Int(_) | ty::Uint(_) => Style::Int,
+                ty::Float(_) => Style::Float,
+                 _ => Style::Unsupported,
+            };
 
         match (in_style, out_style) {
-            (Style::Int(in_is_signed), Style::Int(_)) => {
-                return Ok(match in_width.cmp(&out_width) {
-                    Ordering::Greater => bx.trunc(args[0].immediate(), llret_ty),
-                    Ordering::Equal => args[0].immediate(),
-                    Ordering::Less => {
-                        if in_is_signed {
-                            match (in_width, out_width) {
-                                // FIXME(antoyo): the function _mm_cvtepi8_epi16 should directly
-                                // call an intrinsic equivalent to __builtin_ia32_pmovsxbw128 so that
-                                // we can generate a call to it.
-                                (8, 16) => extend(bx.i8_type, bx.i16_type),
-                                (8, 32) => extend(bx.i8_type, bx.i32_type),
-                                (8, 64) => extend(bx.i8_type, bx.i64_type),
-                                (16, 32) => extend(bx.i16_type, bx.i32_type),
-                                (32, 64) => extend(bx.i32_type, bx.i64_type),
-                                (16, 64) => extend(bx.i16_type, bx.i64_type),
-                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
-                            }
-                        } else {
-                            match (in_width, out_width) {
-                                (8, 16) => extend(bx.u8_type, bx.u16_type),
-                                (8, 32) => extend(bx.u8_type, bx.u32_type),
-                                (8, 64) => extend(bx.u8_type, bx.u64_type),
-                                (16, 32) => extend(bx.u16_type, bx.u32_type),
-                                (16, 64) => extend(bx.u16_type, bx.u64_type),
-                                (32, 64) => extend(bx.u32_type, bx.u64_type),
-                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
-                            }
-                        }
-                    }
-                });
-            }
-            (Style::Int(_), Style::Float) => {
-                // TODO: add support for internal functions in libgccjit to get access to IFN_VEC_CONVERT which is
-                // doing like __builtin_convertvector?
-                // Or maybe provide convert_vector as an API since it might not easy to get the
-                // types of internal functions.
-                unimplemented!();
-            }
-            (Style::Float, Style::Int(_)) => {
-                unimplemented!();
-            }
-            (Style::Float, Style::Float) => {
-                unimplemented!();
-            }
-            _ => { /* Unsupported. Fallthrough. */ }
+            (Style::Unsupported, Style::Unsupported) => {
+                require!(
+                    false,
+                    "unsupported cast from `{}` with element `{}` to `{}` with element `{}`",
+                    in_ty,
+                    in_elem,
+                    ret_ty,
+                    out_elem
+                );
+            },
+            _ => return Ok(bx.context.convert_vector(None, args[0].immediate(), llret_ty)),
         }
-        require!(
-            false,
-            "unsupported cast from `{}` with element `{}` to `{}` with element `{}`",
-            in_ty,
-            in_elem,
-            ret_ty,
-            out_elem
-        );
     }
 
     macro_rules! arith_binary {
@@ -590,6 +508,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
                 );
             }
         };
+        // TODO(antoyo): don't use target specific builtins here.
+        // Not sure how easy it would be to avoid them here.
         let builtin_name =
             match (signed, is_add, in_len, elem_width) {
                 (true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned.
author	Antoni Boucher <bouanto@zoho.com>	2022-05-14 17:36:37 -0400
committer	Antoni Boucher <bouanto@zoho.com>	2022-06-06 22:08:07 -0400
commit	3b3594044327ea6a426e0b95dd3ffee725089430 (patch)
tree	1861b76af4fcd78afd957b2bf65fe4fdf69739c7
parent	e8dca3e87d164d2806098c462c6ce41301341f68 (diff)
download	rust-3b3594044327ea6a426e0b95dd3ffee725089430.tar.gz rust-3b3594044327ea6a426e0b95dd3ffee725089430.zip