about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAntoni Boucher <bouanto@zoho.com>2022-05-03 22:35:26 -0400
committerAntoni Boucher <bouanto@zoho.com>2022-05-03 22:35:26 -0400
commit4b40ac790da69d99028324540fa84404096f04ab (patch)
tree453d0f06132d0731dde0caad3d4781b550ba4800
parenteba654c57ae2fe29ae963b128e5bf57ac2d08a6c (diff)
downloadrust-4b40ac790da69d99028324540fa84404096f04ab.tar.gz
rust-4b40ac790da69d99028324540fa84404096f04ab.zip
Support more SIMD intrinsics and refactor argument adjustment
-rw-r--r--src/builder.rs113
-rw-r--r--src/intrinsic/llvm.rs207
2 files changed, 194 insertions, 126 deletions
diff --git a/src/builder.rs b/src/builder.rs
index 82b0e64e582..df5c29f625e 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -48,6 +48,7 @@ use rustc_target::spec::{HasTargetSpec, Target};
 
 use crate::common::{SignType, TypeReflection, type_is_pointer};
 use crate::context::CodegenCx;
+use crate::intrinsic::llvm;
 use crate::type_of::LayoutGccExt;
 
 // TODO(antoyo)
@@ -224,18 +225,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
             .zip(args.iter())
             .enumerate()
             .map(|(index, (expected_ty, &actual_val))| {
-                // NOTE: these intrinsics have missing parameters before the last one, so ignore the
-                // last argument type check.
-                // FIXME(antoyo): find a way to refactor in order to avoid this hack.
-                match &*func_name {
-                    "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
-                    | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
-                    | "__builtin_ia32_sqrtpd512_mask" => {
-                        if index == args.len() - 1 {
-                            return actual_val;
-                        }
-                    },
-                    _ => (),
+                if llvm::ignore_arg_cast(&func_name, index, args.len()) {
+                    return actual_val;
                 }
 
                 let actual_ty = actual_val.get_type();
@@ -302,7 +293,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
     }
 
     fn function_ptr_call(&mut self, func_ptr: RValue<'gcc>, args: &[RValue<'gcc>], _funclet: Option<&Funclet>) -> RValue<'gcc> {
-        let mut args = self.check_ptr_call("call", func_ptr, args);
+        let args = self.check_ptr_call("call", func_ptr, args);
 
         // gccjit requires to use the result of functions, even when it's not used.
         // That's why we assign the result to a local or call add_eval().
@@ -314,92 +305,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         if return_type != void_type {
             unsafe { RETURN_VALUE_COUNT += 1 };
             let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
-            // Some LLVM intrinsics do not map 1-to-1 to GCC intrinsics, so we add the missing
-            // arguments here.
-            if gcc_func.get_param_count() != args.len() {
-                let func_name = format!("{:?}", func_ptr);
-                match &*func_name {
-                    "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
-                    // FIXME(antoyo): the following intrinsics has 4 (or 5) arguments according to the doc, but is defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs.
-                    | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask"
-                    | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
-                    | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask"
-                    | "__builtin_ia32_pmaxuq128_mask"
-                    | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
-                    | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
-                    | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask"
-                    | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask"
-                    => {
-                        // TODO: refactor by separating those intrinsics outside of this branch.
-                        let add_before_last_arg =
-                            match &*func_name {
-                                "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
-                                | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
-                                | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true,
-                                _ => false,
-                            };
-                        let new_first_arg_is_zero =
-                            match &*func_name {
-                                "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask"
-                                | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true,
-                                _ => false
-                            };
-                        let arg3_index =
-                            match &*func_name {
-                                "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1,
-                                _ => 2,
-                            };
-                        let mut new_args = args.to_vec();
-                        let arg3_type = gcc_func.get_param_type(arg3_index);
-                        let first_arg =
-                            if new_first_arg_is_zero {
-                                let vector_type = arg3_type.dyncast_vector().expect("vector type");
-                                let zero = self.context.new_rvalue_zero(vector_type.get_element_type());
-                                let num_units = vector_type.get_num_units();
-                                self.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units])
-                            }
-                            else {
-                                self.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue()
-                            };
-                        if add_before_last_arg {
-                            new_args.insert(new_args.len() - 1, first_arg);
-                        }
-                        else {
-                            new_args.push(first_arg);
-                        }
-                        let arg4_index =
-                            match &*func_name {
-                                "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2,
-                                _ => 3,
-                            };
-                        let arg4_type = gcc_func.get_param_type(arg4_index);
-                        let minus_one = self.context.new_rvalue_from_int(arg4_type, -1);
-                        if add_before_last_arg {
-                            new_args.insert(new_args.len() - 1, minus_one);
-                        }
-                        else {
-                            new_args.push(minus_one);
-                        }
-                        args = new_args.into();
-                    },
-                    "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
-                        let mut new_args = args.to_vec();
-                        if args.len() == 3 {
-                            // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmaddsub.ps.512 maps to
-                            // the same GCC intrinsic, but the former has 3 parameters and the
-                            // latter has 4 so it doesn't require this additional argument.
-                            let arg4_type = gcc_func.get_param_type(3);
-                            let minus_one = self.context.new_rvalue_from_int(arg4_type, -1);
-                            new_args.push(minus_one);
-                        }
-
-                        let arg5_type = gcc_func.get_param_type(4);
-                        new_args.push(self.context.new_rvalue_from_int(arg5_type, 4));
-                        args = new_args.into();
-                    },
-                    _ => (),
-                }
-            }
+            let func_name = format!("{:?}", func_ptr);
+            let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name);
             self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
             result.to_rvalue()
         }
@@ -1514,11 +1421,11 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         self.vector_reduce(src, |a, b, context| context.new_binary_op(None, op, a.get_type(), a, b))
     }
 
-    pub fn vector_reduce_fadd_fast(&mut self, acc: RValue<'gcc>, src: RValue<'gcc>) -> RValue<'gcc> {
+    pub fn vector_reduce_fadd_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
         unimplemented!();
     }
 
-    pub fn vector_reduce_fmul_fast(&mut self, acc: RValue<'gcc>, src: RValue<'gcc>) -> RValue<'gcc> {
+    pub fn vector_reduce_fmul_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
         unimplemented!();
     }
 
@@ -1553,6 +1460,10 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         let ones = vec![self.context.new_rvalue_one(element_type); num_units];
         let ones = self.context.new_rvalue_from_vector(None, cond_type, &ones);
         let inverted_masks = masks + ones;
+        // NOTE: sometimes, the type of else_val can be different than the type of then_val in
+        // libgccjit (vector of int vs vector of int32_t), but they should be the same for the AND
+        // operation to work.
+        let else_val = self.context.new_bitcast(None, else_val, then_val.get_type());
         let else_vals = inverted_masks & else_val;
 
         then_vals | else_vals
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 5c836736083..1175ea00547 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -1,6 +1,169 @@
-use gccjit::Function;
+use std::borrow::Cow;
 
-use crate::context::CodegenCx;
+use gccjit::{Function, FunctionPtrType, RValue, ToRValue};
+
+use crate::{context::CodegenCx, builder::Builder};
+
+pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, gcc_func: FunctionPtrType<'gcc>, mut args: Cow<'b, [RValue<'gcc>]>, func_name: &str) -> Cow<'b, [RValue<'gcc>]> {
+    // Some LLVM intrinsics do not map 1-to-1 to GCC intrinsics, so we add the missing
+    // arguments here.
+    if gcc_func.get_param_count() != args.len() {
+        match &*func_name {
+            "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
+                // FIXME(antoyo): the following intrinsics has 4 (or 5) arguments according to the doc, but is defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs.
+                | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask"
+                | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask"
+                | "__builtin_ia32_pmaxuq128_mask"
+                | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
+                | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask"
+                | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask"
+                => {
+                    // TODO: refactor by separating those intrinsics outside of this branch.
+                    let add_before_last_arg =
+                        match &*func_name {
+                            "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                                | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                                | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true,
+                            _ => false,
+                        };
+                    let new_first_arg_is_zero =
+                        match &*func_name {
+                            "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask"
+                                | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true,
+                            _ => false
+                        };
+                    let arg3_index =
+                        match &*func_name {
+                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1,
+                            _ => 2,
+                        };
+                    let mut new_args = args.to_vec();
+                    let arg3_type = gcc_func.get_param_type(arg3_index);
+                    let first_arg =
+                        if new_first_arg_is_zero {
+                            let vector_type = arg3_type.dyncast_vector().expect("vector type");
+                            let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                            let num_units = vector_type.get_num_units();
+                            builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units])
+                        }
+                        else {
+                            builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue()
+                        };
+                    if add_before_last_arg {
+                        new_args.insert(new_args.len() - 1, first_arg);
+                    }
+                    else {
+                        new_args.push(first_arg);
+                    }
+                    let arg4_index =
+                        match &*func_name {
+                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2,
+                            _ => 3,
+                        };
+                    let arg4_type = gcc_func.get_param_type(arg4_index);
+                    let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                    if add_before_last_arg {
+                        new_args.insert(new_args.len() - 1, minus_one);
+                    }
+                    else {
+                        new_args.push(minus_one);
+                    }
+                    args = new_args.into();
+                },
+                "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
+                    | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
+                    | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
+                        let mut new_args = args.to_vec();
+                        let arg5_type = gcc_func.get_param_type(4);
+                        let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                        new_args.push(minus_one);
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+                        let mut new_args = args.to_vec();
+
+                        let mut last_arg = None;
+                        if args.len() == 4 {
+                            last_arg = new_args.pop();
+                        }
+
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+
+                        if args.len() == 3 {
+                            // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to
+                            // the same GCC intrinsic, but the former has 3 parameters and the
+                            // latter has 4 so it doesn't require this additional argument.
+                            let arg5_type = gcc_func.get_param_type(4);
+                            new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
+                        }
+
+                        if let Some(last_arg) = last_arg {
+                            new_args.push(last_arg);
+                        }
+
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+                        | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+                        | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
+                        | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+                        let mut new_args = args.to_vec();
+                        let last_arg = new_args.pop().expect("last arg");
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+                        new_args.push(last_arg);
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" => {
+                        let mut new_args = args.to_vec();
+                        let last_arg = new_args.pop().expect("last arg");
+                        let arg3_type = gcc_func.get_param_type(2);
+                        let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
+                        new_args.push(undefined);
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+                        new_args.push(last_arg);
+                        args = new_args.into();
+                    },
+                    _ => (),
+        }
+    }
+
+    args
+}
+
+pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
+    // NOTE: these intrinsics have missing parameters before the last one, so ignore the
+    // last argument type check.
+    // FIXME(antoyo): find a way to refactor in order to avoid this hack.
+    match func_name {
+        "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+            | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
+            | "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+            | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+            | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+            | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
+            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+                if index == args_len - 1 {
+                    return true;
+                }
+            },
+        "__builtin_ia32_vfmaddps512_mask" => {
+            if args_len == 4 && index == args_len - 1 {
+                return true;
+            }
+        },
+        _ => (),
+    }
+
+    false
+}
 
 #[cfg(not(feature="master"))]
 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
@@ -37,29 +200,23 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
         "llvm.x86.avx512.mask.pminu.q.128" => "__builtin_ia32_pminuq128_mask",
         "llvm.fma.v16f32" => "__builtin_ia32_vfmaddps512_mask",
         "llvm.fma.v8f64" => "__builtin_ia32_vfmaddpd512_mask",
-        "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddps512_mask",
-        "llvm.x86.avx512.vfmaddsub.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
-        "llvm.x86.avx512.rcp14.ps.256" => "__builtin_ia32_rcp14ps256_mask",
-        "llvm.x86.avx512.rcp14.ps.128" => "__builtin_ia32_rcp14ps128_mask",
-        "llvm.x86.avx512.rcp14.pd.256" => "__builtin_ia32_rcp14pd256_mask",
-        "llvm.x86.avx512.rcp14.pd.128" => "__builtin_ia32_rcp14pd128_mask",
-        "llvm.x86.avx512.rsqrt14.ps.256" => "__builtin_ia32_rsqrt14ps256_mask",
-        "llvm.x86.avx512.rsqrt14.ps.128" => "__builtin_ia32_rsqrt14ps128_mask",
-        "llvm.x86.avx512.rsqrt14.pd.256" => "__builtin_ia32_rsqrt14pd256_mask",
-        "llvm.x86.avx512.rsqrt14.pd.128" => "__builtin_ia32_rsqrt14pd128_mask",
-        "llvm.x86.avx512.mask.getexp.ps.512" => "__builtin_ia32_getexpps512_mask",
-        "llvm.x86.avx512.mask.getexp.ps.256" => "__builtin_ia32_getexpps256_mask",
-        "llvm.x86.avx512.mask.getexp.ps.128" => "__builtin_ia32_getexpps128_mask",
-        "llvm.x86.avx512.mask.getexp.pd.512" => "__builtin_ia32_getexppd512_mask",
-        "llvm.x86.avx512.mask.getexp.pd.256" => "__builtin_ia32_getexppd256_mask",
-        "llvm.x86.avx512.mask.getexp.pd.128" => "__builtin_ia32_getexppd128_mask",
-        "llvm.x86.avx512.mask.rndscale.ps.256" => "__builtin_ia32_rndscaleps_256_mask",
-        "llvm.x86.avx512.mask.rndscale.ps.128" => "__builtin_ia32_rndscaleps_128_mask",
-        "llvm.x86.avx512.mask.rndscale.pd.256" => "__builtin_ia32_rndscalepd_256_mask",
-        "llvm.x86.avx512.mask.rndscale.pd.128" => "__builtin_ia32_rndscalepd_128_mask",
-        "llvm.x86.avx512.mask.scalef.ps.512" => "__builtin_ia32_scalefps512_mask",
-        "llvm.x86.avx512.mask.scalef.ps.256" => "__builtin_ia32_scalefps256_mask",
-        "llvm.x86.avx512.mask.scalef.ps.128" => "__builtin_ia32_scalefps128_mask",
+        "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddsubps512_mask",
+        "llvm.x86.avx512.vfmaddsub.pd.512" => "__builtin_ia32_vfmaddsubpd512_mask",
+        "llvm.x86.avx512.pternlog.d.512" => "__builtin_ia32_pternlogd512_mask",
+        "llvm.x86.avx512.pternlog.d.256" => "__builtin_ia32_pternlogd256_mask",
+        "llvm.x86.avx512.pternlog.d.128" => "__builtin_ia32_pternlogd128_mask",
+        "llvm.x86.avx512.pternlog.q.512" => "__builtin_ia32_pternlogq512_mask",
+        "llvm.x86.avx512.pternlog.q.256" => "__builtin_ia32_pternlogq256_mask",
+        "llvm.x86.avx512.pternlog.q.128" => "__builtin_ia32_pternlogq128_mask",
+        "llvm.x86.avx512.add.ps.512" => "__builtin_ia32_addps512_mask",
+        "llvm.x86.avx512.add.pd.512" => "__builtin_ia32_addpd512_mask",
+        "llvm.x86.avx512.sub.ps.512" => "__builtin_ia32_subps512_mask",
+        "llvm.x86.avx512.sub.pd.512" => "__builtin_ia32_subpd512_mask",
+        "llvm.x86.avx512.mul.ps.512" => "__builtin_ia32_mulps512_mask",
+        "llvm.x86.avx512.mul.pd.512" => "__builtin_ia32_mulpd512_mask",
+        "llvm.x86.avx512.div.ps.512" => "__builtin_ia32_divps512_mask",
+        "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask",
+        "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask",
 
         // The above doc points to unknown builtins for the following, so override them:
         "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",