diff options
| author | Afonso Bordado <afonso360@users.noreply.github.com> | 2022-08-23 11:42:35 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-08-23 12:42:35 +0200 |
| commit | 48c45c481c53e615637788e3dad1b7dcb993bdc3 (patch) | |
| tree | e12e8611735ec36f9f84d3504dd9d0432ad31bc7 | |
| parent | 156bda8bc708cca60e9de18743d833c8d97dd7ff (diff) | |
| download | rust-48c45c481c53e615637788e3dad1b7dcb993bdc3.tar.gz rust-48c45c481c53e615637788e3dad1b7dcb993bdc3.zip | |
Use native scalar `fma` instruction (#1267)
Cranelift 0.87 now supports lowering `fma` as a libcall on x86 [0], with 0.88 enabling the native x86 instruction under the `has_fma` flag. aarch64 and s390x already support this as a native instruction, so it's nice that we emit it for those. We can't lower the SIMD version using the vector `fma` instruction, since that lowering can fail if the x86 `has_fma` flag is not enabled and Cranelift doesn't yet know how to fall back in these cases, so the SIMD intrinsic emits a scalar `fma` for each lane instead. [0]: https://github.com/bytecodealliance/wasmtime/commit/709716bb8e6adaf7e65f3497168af23ce0cf09ef
| -rw-r--r-- | src/intrinsics/mod.rs | 6 | ||||
| -rw-r--r-- | src/intrinsics/simd.rs | 18 |
2 files changed, 12 insertions, 12 deletions
diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index cb620822f2d..ef3d5ccea8a 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -303,6 +303,12 @@ fn codegen_float_intrinsic_call<'tcx>( let layout = fx.layout_of(ty); let res = match intrinsic { + sym::fmaf32 | sym::fmaf64 => { + let a = args[0].load_scalar(fx); + let b = args[1].load_scalar(fx); + let c = args[2].load_scalar(fx); + CValue::by_val(fx.bcx.ins().fma(a, b, c), layout) + } sym::copysignf32 | sym::copysignf64 => { let a = args[0].load_scalar(fx); let b = args[1].load_scalar(fx); diff --git a/src/intrinsics/simd.rs b/src/intrinsics/simd.rs index c7efdb392b7..a32b413d45f 100644 --- a/src/intrinsics/simd.rs +++ b/src/intrinsics/simd.rs @@ -397,21 +397,15 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( let layout = a.layout(); let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let res_lane_layout = fx.layout_of(lane_ty); for lane in 0..lane_count { - let a_lane = a.value_lane(fx, lane); - let b_lane = b.value_lane(fx, lane); - let c_lane = c.value_lane(fx, lane); + let a_lane = a.value_lane(fx, lane).load_scalar(fx); + let b_lane = b.value_lane(fx, lane).load_scalar(fx); + let c_lane = c.value_lane(fx, lane).load_scalar(fx); - let res_lane = match lane_ty.kind() { - ty::Float(FloatTy::F32) => { - fx.easy_call("fmaf", &[a_lane, b_lane, c_lane], lane_ty) - } - ty::Float(FloatTy::F64) => { - fx.easy_call("fma", &[a_lane, b_lane, c_lane], lane_ty) - } - _ => unreachable!(), - }; + let res_lane = fx.bcx.ins().fma(a_lane, b_lane, c_lane); + let res_lane = CValue::by_val(res_lane, res_lane_layout); ret.place_lane(fx, lane).write_cvalue(fx, res_lane); } |
