From 91c842d1447a98d169c117b23916e2bbab8f925a Mon Sep 17 00:00:00 2001
From: Robrecht Blancquaert
Date: Mon, 31 Oct 2022 14:02:48 +0100
Subject: [PATCH] Update ARM NEON intrinsics for better speed-up

---
 src/backend/vector/neon/field.rs | 184 ++++++++++++++-----------------
 1 file changed, 84 insertions(+), 100 deletions(-)

diff --git a/src/backend/vector/neon/field.rs b/src/backend/vector/neon/field.rs
index 0d4de776f..8c93afef0 100644
--- a/src/backend/vector/neon/field.rs
+++ b/src/backend/vector/neon/field.rs
@@ -10,9 +10,9 @@
 // - Henry de Valence
 // - Robrecht Blancquaert
 
-//! More details on the algorithms can be found in the `avx2` 
-//! module. Here comments are mostly added only when needed 
-//! to explain differenes between the 'base' avx2 version and 
+//! More details on the algorithms can be found in the `avx2`
+//! module. Here comments are mostly added only when needed
+//! to explain differences between the 'base' avx2 version and
 //! this re-implementation for arm neon.
 
 //! The most major difference is the split of one vector of 8
@@ -21,7 +21,7 @@
 //! arm instructions.
 
 use core::ops::{Add, Mul, Neg};
-use packed_simd::{u32x8, u32x4, u32x2, i32x4, u8x16, u64x4, u64x2, IntoBits};
+use packed_simd::{u32x4, u32x2, i32x4, u8x16, u64x4, u64x2, IntoBits};
 
 use crate::backend::vector::neon::constants::{P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO};
 use crate::backend::serial::u64::field::FieldElement51;
@@ -55,18 +55,17 @@ fn unpack_pair(src: (u32x4, u32x4)) -> ((u32x2, u32x2), (u32x2, u32x2)) {
 #[inline(always)]
 fn repack_pair(x: (u32x4, u32x4), y: (u32x4, u32x4)) -> (u32x4, u32x4) {
     unsafe {
-        use core::arch::aarch64::vqtbl1q_u8;
-        use core::arch::aarch64::vorrq_u32;
-
-        const idx_high: packed_simd::Simd<[u8; 16]> = u8x16::new( 0, 1, 2, 3, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16);
-        const idx_low : packed_simd::Simd<[u8; 16]> = u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 8, 9, 10, 11);
-        let x_shuffled: (u8x16, u8x16) = (vqtbl1q_u8(x.0.into_bits(), idx_high.into_bits()).into_bits(),
-                                          vqtbl1q_u8(x.1.into_bits(), idx_high.into_bits()).into_bits());
-        let y_shuffled: (u8x16, u8x16) = (vqtbl1q_u8(y.0.into_bits(), idx_low.into_bits()).into_bits(),
-                                          vqtbl1q_u8(y.1.into_bits(), idx_low.into_bits()).into_bits());
-
-        return (vorrq_u32(x_shuffled.0.into_bits(), y_shuffled.0.into_bits()).into_bits(),
-                vorrq_u32(x_shuffled.1.into_bits(), y_shuffled.1.into_bits()).into_bits());
+        use core::arch::aarch64::vget_low_u32;
+        use core::arch::aarch64::vcombine_u32;
+        use core::arch::aarch64::vset_lane_u32;
+        use core::arch::aarch64::vgetq_lane_u32;
+
+        (vcombine_u32(
+            vset_lane_u32(vgetq_lane_u32(x.0.into_bits(), 2), vget_low_u32(x.0.into_bits()), 1),
+            vset_lane_u32(vgetq_lane_u32(y.0.into_bits(), 2), vget_low_u32(y.0.into_bits()), 1)).into_bits(),
+         vcombine_u32(
+            vset_lane_u32(vgetq_lane_u32(x.1.into_bits(), 2), vget_low_u32(x.1.into_bits()), 1),
+            vset_lane_u32(vgetq_lane_u32(y.1.into_bits(), 2), vget_low_u32(y.1.into_bits()), 1)).into_bits())
     }
 }
 
@@ -96,6 +95,24 @@ pub enum Shuffle {
     ABDC,
 }
 
+macro_rules! lane_shuffle {
+    {$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $x:expr} => {
+        unsafe {
+            use core::arch::aarch64::vgetq_lane_u32;
+            const c: [i32; 8] = [$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7];
+            (u32x4::new(if c[0] < 4 { vgetq_lane_u32($x.0.into_bits(), c[0]) } else { vgetq_lane_u32($x.1.into_bits(), c[0] - 4) },
+                        if c[1] < 4 { vgetq_lane_u32($x.0.into_bits(), c[1]) } else { vgetq_lane_u32($x.1.into_bits(), c[1] - 4) },
+                        if c[2] < 4 { vgetq_lane_u32($x.0.into_bits(), c[2]) } else { vgetq_lane_u32($x.1.into_bits(), c[2] - 4) },
+                        if c[3] < 4 { vgetq_lane_u32($x.0.into_bits(), c[3]) } else { vgetq_lane_u32($x.1.into_bits(), c[3] - 4) }),
+             u32x4::new(if c[4] < 4 { vgetq_lane_u32($x.0.into_bits(), c[4]) } else { vgetq_lane_u32($x.1.into_bits(), c[4] - 4) },
+                        if c[5] < 4 { vgetq_lane_u32($x.0.into_bits(), c[5]) } else { vgetq_lane_u32($x.1.into_bits(), c[5] - 4) },
+                        if c[6] < 4 { vgetq_lane_u32($x.0.into_bits(), c[6]) } else { vgetq_lane_u32($x.1.into_bits(), c[6] - 4) },
+                        if c[7] < 4 { vgetq_lane_u32($x.0.into_bits(), c[7]) } else { vgetq_lane_u32($x.1.into_bits(), c[7] - 4) }))
+        }
+
+    }
+}
+
 #[derive(Clone, Copy, Debug)]
 pub struct FieldElement2625x4(pub(crate) [(u32x4, u32x4); 5]);
 
@@ -144,14 +161,14 @@ impl FieldElement2625x4 {
     pub fn split(&self) -> [FieldElement51; 4] {
         let mut out = [FieldElement51::zero(); 4];
         for i in 0..5 {
-            let a_2i   = self.0[i].0.extract(0) as u64; 
-            let b_2i   = self.0[i].0.extract(1) as u64; 
-            let a_2i_1 = self.0[i].0.extract(2) as u64; 
+            let a_2i   = self.0[i].0.extract(0) as u64;
+            let b_2i   = self.0[i].0.extract(1) as u64;
+            let a_2i_1 = self.0[i].0.extract(2) as u64;
             let b_2i_1 = self.0[i].0.extract(3) as u64;
             let c_2i   = self.0[i].1.extract(0) as u64;
-            let d_2i   = self.0[i].1.extract(1) as u64; 
-            let c_2i_1 = self.0[i].1.extract(2) as u64; 
-            let d_2i_1 = self.0[i].1.extract(3) as u64; 
+            let d_2i   = self.0[i].1.extract(1) as u64;
+            let c_2i_1 = self.0[i].1.extract(2) as u64;
+            let d_2i_1 = self.0[i].1.extract(3) as u64;
 
             out[0].0[i] = a_2i + (a_2i_1 << 26);
             out[1].0[i] = b_2i + (b_2i_1 << 26);
@@ -162,64 +179,21 @@ impl FieldElement2625x4 {
         out
     }
 
-    /// Used double vqtbx1q_u8 intructions, so 4 instead of normally needed 2,
-    /// because of the need to put limbs from the second vector of limbs into
-    /// the first vector and vice versa.
     #[inline]
     pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 {
         #[inline(always)]
         fn shuffle_lanes(x: (u32x4, u32x4), control: Shuffle) -> (u32x4, u32x4) {
-            unsafe {
-                use core::arch::aarch64::vqtbx1q_u8;
-
-                let c: (u8x16, u8x16, u8x16, u8x16) = match control {
-                    Shuffle::BADC => (u8x16::new( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11), // Reorder first vector
-                                      u8x16::new( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11), // Reorder second vector
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), // Not used
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)),// Not used
-                    Shuffle::ABAB => (u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), // Take first vector
-                                      u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16), // Ignore second vector, take reordered first vector
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), // Not used
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)),// Take first vector
-                    Shuffle::AAAA => (u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11), // p(v1) || (p(v2) || p(0)) <\ p(x) = permute(x)
-                                      u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16), // p(v2) || (p(v1) || p(0)) | <\
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), // p(v2) _/ |
-                                      u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11)),// p(v1) _/
-                    Shuffle::BBBB => (u8x16::new( 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15),
-                                      u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15)),
-                    Shuffle::DBBD => (u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15),
-                                      u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15),
-                                      u8x16::new( 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15),
-                                      u8x16::new( 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15)),
-                    Shuffle::CACA => (u8x16::new(16, 16, 16, 16, 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11),
-                                      u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11)),
-                    Shuffle::BACD => (u8x16::new( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)),
-                    Shuffle::ABDC => (u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)),
-                    Shuffle::ADDA => (u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16),
-                                      u8x16::new( 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15, 16, 16, 16, 16),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11)),
-                    Shuffle::CBCB => (u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-                                      u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
-                };
-                (vqtbx1q_u8(vqtbx1q_u8(u32x4::splat(0).into_bits(), x.1.into_bits(), c.2.into_bits()),
-                            x.0.into_bits(),
-                            c.0.into_bits()).into_bits(),
-                 vqtbx1q_u8(vqtbx1q_u8(u32x4::splat(0).into_bits(), x.0.into_bits(), c.3.into_bits()),
-                            x.1.into_bits(),
-                            c.1.into_bits()).into_bits())
-            }
+            match control {
+                Shuffle::AAAA => lane_shuffle!(0, 0, 2, 2, 0, 0, 2, 2, x),
+                Shuffle::BBBB => lane_shuffle!(1, 1, 3, 3, 1, 1, 3, 3, x),
+                Shuffle::CACA => lane_shuffle!(4, 0, 6, 2, 4, 0, 6, 2, x),
+                Shuffle::DBBD => lane_shuffle!(5, 1, 7, 3, 1, 5, 3, 7, x),
+                Shuffle::ADDA => lane_shuffle!(0, 5, 2, 7, 5, 0, 7, 2, x),
+                Shuffle::CBCB => lane_shuffle!(4, 1, 6, 3, 4, 1, 6, 3, x),
+                Shuffle::ABAB => lane_shuffle!(0, 1, 2, 3, 0, 1, 2, 3, x),
+                Shuffle::BADC => lane_shuffle!(1, 0, 3, 2, 5, 4, 7, 6, x),
+                Shuffle::BACD => lane_shuffle!(1, 0, 3, 2, 4, 5, 6, 7, x),
+                Shuffle::ABDC => lane_shuffle!(0, 1, 2, 3, 5, 4, 7, 6, x),
+            }
         }
 
@@ -232,6 +206,7 @@ impl FieldElement2625x4 {
         ])
     }
 
+    // Can probably be sped up using multiple vset/vget instead of a table lookup
     #[inline]
     pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 {
         #[inline(always)]
         fn blend_lanes(x: (u32x4, u32x4), y: (u32x4, u32x4), control: Lanes) -> (u32x4, u32x4) {
             unsafe {
                 use core::arch::aarch64::vqtbx1q_u8;
                 match control {
                     Lanes::C => {
-                        (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16).into_bits()).into_bits(),
+                        (x.0,
                          vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits())
                     }
                     Lanes::D => {
-                        (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16).into_bits()).into_bits(),
+                        (x.0,
                          vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits()).into_bits())
                     }
                     Lanes::AD => {
@@ -259,7 +234,7 @@ impl FieldElement2625x4 {
                         vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits())
                     }
                     Lanes::CD => {
-                        (x.0, y.1) 
+                        (x.0, y.1)
                     }
                     Lanes::BC => {
                         (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits() ).into_bits(),
@@ -354,20 +329,25 @@ impl FieldElement2625x4 {
         let rotated_carryout = |v: (u32x4, u32x4)| -> (u32x4, u32x4) {
             unsafe {
                 use core::arch::aarch64::vqshlq_u32;
-                use core::arch::aarch64::vqtbl1q_u8;
+                use core::arch::aarch64::vget_low_u32;
+                use core::arch::aarch64::vget_high_u32;
+                use core::arch::aarch64::vcombine_u32;
 
-                let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(), vqshlq_u32(v.1.into_bits(), shifts.1.into_bits()).into_bits());
-                (vqtbl1q_u8(c.0.into_bits(), u8x16::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).into_bits()).into_bits(),
-                 vqtbl1q_u8(c.1.into_bits(), u8x16::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).into_bits()).into_bits())
+                let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(),
+                                         vqshlq_u32(v.1.into_bits(), shifts.1.into_bits()).into_bits());
+                (vcombine_u32(vget_high_u32(c.0.into_bits()), vget_low_u32(c.0.into_bits())).into_bits(),
+                 vcombine_u32(vget_high_u32(c.1.into_bits()), vget_low_u32(c.1.into_bits())).into_bits())
             }
         };
 
         let combine = |v_lo: (u32x4, u32x4), v_hi: (u32x4, u32x4)| -> (u32x4, u32x4) {
             unsafe {
-                use core::arch::aarch64::vqtbx1q_u8;
-                (vqtbx1q_u8(v_lo.0.into_bits(), v_hi.0.into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 8, 9, 10, 11, 12, 13, 14, 15).into_bits()).into_bits(),
-                 vqtbx1q_u8(v_lo.1.into_bits(), v_hi.1.into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 8, 9, 10, 11, 12, 13, 14, 15).into_bits()).into_bits())
+                use core::arch::aarch64::vget_low_u32;
+                use core::arch::aarch64::vget_high_u32;
+                use core::arch::aarch64::vcombine_u32;
+                (vcombine_u32(vget_low_u32(v_lo.0.into_bits()), vget_high_u32(v_hi.0.into_bits())).into_bits(),
+                 vcombine_u32(vget_low_u32(v_lo.1.into_bits()), vget_high_u32(v_hi.1.into_bits())).into_bits())
             }
         };
 
@@ -395,12 +375,13 @@ impl FieldElement2625x4 {
         let c9_19: (u32x4, u32x4) = unsafe {
             use core::arch::aarch64::vmulq_n_u32;
-            use core::arch::aarch64::vqtbl1q_u8;
-
+            use core::arch::aarch64::vget_low_u32;
+            use core::arch::aarch64::vcombine_u32;
+
             let c9_19_spread: (u32x4, u32x4) = (vmulq_n_u32(c98.0.into_bits(), 19).into_bits(),
                                                 vmulq_n_u32(c98.1.into_bits(), 19).into_bits());
 
-            (vqtbl1q_u8(c9_19_spread.0.into_bits(), u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16).into_bits()).into_bits(),
-             vqtbl1q_u8(c9_19_spread.1.into_bits(), u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16).into_bits()).into_bits())
+            (vcombine_u32(vget_low_u32(c9_19_spread.0.into_bits()), u32x2::splat(0).into_bits()).into_bits(),
+             vcombine_u32(vget_low_u32(c9_19_spread.1.into_bits()), u32x2::splat(0).into_bits()).into_bits())
         };
 
         v[0] = (v[0].0 + c9_19.0, v[0].1 + c9_19.1);
@@ -442,9 +423,9 @@ impl FieldElement2625x4 {
         unsafe {
             use core::arch::aarch64::vmulq_n_u32;
 
-            c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(), 
+            c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(),
                   vmulq_n_u32(c0.1.into_bits(), 19).into_bits());
-            c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(), 
+            c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(),
                   vmulq_n_u32(c1.1.into_bits(), 19).into_bits());
         }
 
@@ -476,8 +457,8 @@ impl FieldElement2625x4 {
         #[inline(always)]
         fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) {
             use core::arch::aarch64::vmull_u32;
-            unsafe { 
-                let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), 
+            unsafe {
+                let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
                                          vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits());
                 (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2)))
             }
@@ -516,7 +497,7 @@ impl FieldElement2625x4 {
         let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1);
         let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1);
         let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5);
-        
+
         let low__p37 = u64x4::splat(0x3ffffed << 37);
         let even_p37 = u64x4::splat(0x3ffffff << 37);
@@ -524,13 +505,16 @@ impl FieldElement2625x4 {
         let negate_D = |x_01: u64x4, p_01: u64x4| -> (u64x2, u64x2) {
             unsafe {
-                use core::arch::aarch64::vqtbx1q_u8;
+                use core::arch::aarch64::vget_low_u32;
+                use core::arch::aarch64::vget_high_u32;
+                use core::arch::aarch64::vcombine_u32;
 
                 let x = (u64x2::new(x_01.extract(0), x_01.extract(1)), u64x2::new(x_01.extract(2), x_01.extract(3)));
                 let p = (u64x2::new(p_01.extract(0), p_01.extract(1)), u64x2::new(p_01.extract(2), p_01.extract(3)));
-                (vqtbx1q_u8(x.0.into_bits(), (p.0 - x.0).into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16).into_bits()).into_bits(),
-                 vqtbx1q_u8(x.1.into_bits(), (p.1 - x.1).into_bits(), u8x16::new(16, 16, 16, 16, 16, 16, 16, 16, 8, 9, 10, 11, 12, 13, 14, 15).into_bits()).into_bits())
+                (x.0.into_bits(),
+                 vcombine_u32(vget_low_u32(x.1.into_bits()),
+                              vget_high_u32((p.1 - x.1).into_bits())).into_bits())
             }
         };
 
@@ -625,8 +609,8 @@ impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
         #[inline(always)]
         fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) {
             use core::arch::aarch64::vmull_u32;
-            unsafe { 
-                let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), 
+            unsafe {
+                let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
                                          vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits());
                 (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2)))
            }
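
Note on the repack trick (illustration, not part of the patch): the new `repack_pair`
gathers the even 32-bit lanes of two vectors with direct lane moves instead of a
`vqtbl1q_u8` table lookup followed by `vorrq_u32`. A minimal standalone sketch of
the same idea on raw `core::arch` NEON types, assuming an aarch64 target; the
function name `repack_half` and the test values are hypothetical:

    // Gather the even lanes of `x` and `y` into [x0, x2, y0, y2] using only
    // lane moves, the way the patched repack_pair handles each half of its
    // (u32x4, u32x4) pairs.
    #[cfg(target_arch = "aarch64")]
    fn repack_half(
        x: core::arch::aarch64::uint32x4_t,
        y: core::arch::aarch64::uint32x4_t,
    ) -> core::arch::aarch64::uint32x4_t {
        use core::arch::aarch64::*;
        unsafe {
            // vget_low_u32(x) = [x0, x1]; overwrite lane 1 with x2 -> [x0, x2].
            let lo = vset_lane_u32::<1>(vgetq_lane_u32::<2>(x), vget_low_u32(x));
            let hi = vset_lane_u32::<1>(vgetq_lane_u32::<2>(y), vget_low_u32(y));
            // Glue the two 64-bit halves back into one 128-bit vector.
            vcombine_u32(lo, hi)
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn main() {
        use core::arch::aarch64::{vld1q_u32, vst1q_u32};
        let (xs, ys) = ([10u32, 11, 12, 13], [20u32, 21, 22, 23]);
        let mut out = [0u32; 4];
        unsafe {
            let packed = repack_half(vld1q_u32(xs.as_ptr()), vld1q_u32(ys.as_ptr()));
            vst1q_u32(out.as_mut_ptr(), packed);
        }
        assert_eq!(out, [10, 12, 20, 22]); // even lanes of x, then even lanes of y
    }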
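
A related sketch for the reduce path: `rotated_carryout` and `combine` now swap or
splice the 64-bit halves of each 128-bit vector with `vget_low_u32`, `vget_high_u32`,
and `vcombine_u32` rather than a byte-table lookup. Under the same aarch64
assumption (the name `swap_halves` is hypothetical):

    // [a0, a1, a2, a3] -> [a2, a3, a0, a1]: rotate carries between the two
    // halves, as the patched rotated_carryout does after vqshlq_u32.
    #[cfg(target_arch = "aarch64")]
    unsafe fn swap_halves(
        v: core::arch::aarch64::uint32x4_t,
    ) -> core::arch::aarch64::uint32x4_t {
        use core::arch::aarch64::{vcombine_u32, vget_high_u32, vget_low_u32};
        vcombine_u32(vget_high_u32(v), vget_low_u32(v))
    }

Half moves like this avoid loading a byte-index table, which is presumably the
speed-up the commit title refers to.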