From 26c494fcd35c56b11b26edf544e45d0a97c8102a Mon Sep 17 00:00:00 2001 From: Ruben De Smet Date: Wed, 7 Dec 2022 15:17:21 +0100 Subject: [PATCH] Use packed_simd::shuffle instead of vqtbx1q_u8 --- src/backend/vector/neon/field.rs | 75 +++++++++++++++----------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/src/backend/vector/neon/field.rs b/src/backend/vector/neon/field.rs index 322d7734e..07ec72c82 100644 --- a/src/backend/vector/neon/field.rs +++ b/src/backend/vector/neon/field.rs @@ -10,9 +10,9 @@ // - Henry de Valence // - Robrecht Blancquaert -//! More details on the algorithms can be found in the `avx2` -//! module. Here comments are mostly added only when needed -//! to explain differenes between the 'base' avx2 version and +//! More details on the algorithms can be found in the `avx2` +//! module. Here comments are mostly added only when needed +//! to explain differenes between the 'base' avx2 version and //! this re-implementation for arm neon. //! The most major difference is the split of one vector of 8 @@ -61,10 +61,10 @@ fn repack_pair(x: (u32x4, u32x4), y: (u32x4, u32x4)) -> (u32x4, u32x4) { use core::arch::aarch64::vgetq_lane_u32; (vcombine_u32( - vset_lane_u32(vgetq_lane_u32(x.0.into_bits(), 2) , vget_low_u32(x.0.into_bits()), 1), - vset_lane_u32(vgetq_lane_u32(y.0.into_bits(), 2) , vget_low_u32(y.0.into_bits()), 1)).into_bits(), + vset_lane_u32(vgetq_lane_u32(x.0.into_bits(), 2) , vget_low_u32(x.0.into_bits()), 1), + vset_lane_u32(vgetq_lane_u32(y.0.into_bits(), 2) , vget_low_u32(y.0.into_bits()), 1)).into_bits(), vcombine_u32( - vset_lane_u32(vgetq_lane_u32(x.1.into_bits(), 2) , vget_low_u32(x.1.into_bits()), 1), + vset_lane_u32(vgetq_lane_u32(x.1.into_bits(), 2) , vget_low_u32(x.1.into_bits()), 1), vset_lane_u32(vgetq_lane_u32(y.1.into_bits(), 2) , vget_low_u32(y.1.into_bits()), 1)).into_bits()) } } @@ -100,16 +100,16 @@ macro_rules! lane_shuffle { unsafe { use core::arch::aarch64::vgetq_lane_u32; const c: [i32; 8] = [$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7]; - (u32x4::new(if c[0] < 4 { vgetq_lane_u32($x.0.into_bits(), c[0]) } else { vgetq_lane_u32($x.1.into_bits(), c[0] - 4) }, - if c[1] < 4 { vgetq_lane_u32($x.0.into_bits(), c[1]) } else { vgetq_lane_u32($x.1.into_bits(), c[1] - 4) }, - if c[2] < 4 { vgetq_lane_u32($x.0.into_bits(), c[2]) } else { vgetq_lane_u32($x.1.into_bits(), c[2] - 4) }, + (u32x4::new(if c[0] < 4 { vgetq_lane_u32($x.0.into_bits(), c[0]) } else { vgetq_lane_u32($x.1.into_bits(), c[0] - 4) }, + if c[1] < 4 { vgetq_lane_u32($x.0.into_bits(), c[1]) } else { vgetq_lane_u32($x.1.into_bits(), c[1] - 4) }, + if c[2] < 4 { vgetq_lane_u32($x.0.into_bits(), c[2]) } else { vgetq_lane_u32($x.1.into_bits(), c[2] - 4) }, if c[3] < 4 { vgetq_lane_u32($x.0.into_bits(), c[3]) } else { vgetq_lane_u32($x.1.into_bits(), c[3] - 4) }), - u32x4::new(if c[4] < 4 { vgetq_lane_u32($x.0.into_bits(), c[4]) } else { vgetq_lane_u32($x.1.into_bits(), c[4] - 4) }, - if c[5] < 4 { vgetq_lane_u32($x.0.into_bits(), c[5]) } else { vgetq_lane_u32($x.1.into_bits(), c[5] - 4) }, - if c[6] < 4 { vgetq_lane_u32($x.0.into_bits(), c[6]) } else { vgetq_lane_u32($x.1.into_bits(), c[6] - 4) }, + u32x4::new(if c[4] < 4 { vgetq_lane_u32($x.0.into_bits(), c[4]) } else { vgetq_lane_u32($x.1.into_bits(), c[4] - 4) }, + if c[5] < 4 { vgetq_lane_u32($x.0.into_bits(), c[5]) } else { vgetq_lane_u32($x.1.into_bits(), c[5] - 4) }, + if c[6] < 4 { vgetq_lane_u32($x.0.into_bits(), c[6]) } else { vgetq_lane_u32($x.1.into_bits(), c[6] - 4) }, if c[7] < 4 { vgetq_lane_u32($x.0.into_bits(), c[7]) } else { vgetq_lane_u32($x.1.into_bits(), c[7] - 4) })) } - + } } @@ -161,14 +161,14 @@ impl FieldElement2625x4 { pub fn split(&self) -> [FieldElement51; 4] { let mut out = [FieldElement51::zero(); 4]; for i in 0..5 { - let a_2i = self.0[i].0.extract(0) as u64; - let b_2i = self.0[i].0.extract(1) as u64; - let a_2i_1 = self.0[i].0.extract(2) as u64; + let a_2i = self.0[i].0.extract(0) as u64; + let b_2i = self.0[i].0.extract(1) as u64; + let a_2i_1 = self.0[i].0.extract(2) as u64; let b_2i_1 = self.0[i].0.extract(3) as u64; let c_2i = self.0[i].1.extract(0) as u64; - let d_2i = self.0[i].1.extract(1) as u64; - let c_2i_1 = self.0[i].1.extract(2) as u64; - let d_2i_1 = self.0[i].1.extract(3) as u64; + let d_2i = self.0[i].1.extract(1) as u64; + let c_2i_1 = self.0[i].1.extract(2) as u64; + let d_2i_1 = self.0[i].1.extract(3) as u64; out[0].0[i] = a_2i + (a_2i_1 << 26); out[1].0[i] = b_2i + (b_2i_1 << 26); @@ -212,33 +212,28 @@ impl FieldElement2625x4 { #[inline(always)] fn blend_lanes(x: (u32x4, u32x4), y: (u32x4, u32x4), control: Lanes) -> (u32x4, u32x4) { unsafe { - use core::arch::aarch64::vqtbx1q_u8; + use packed_simd::shuffle; match control { Lanes::C => { - (x.0, - vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits()) + (x.0, shuffle!(y.1, x.1, [0, 5, 2, 7])) } Lanes::D => { - (x.0, - vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits()).into_bits()) + (x.0, shuffle!(y.1, x.1, [4, 1, 6, 3])) } Lanes::AD => { - (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits() ).into_bits(), - vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits() ).into_bits()) + (shuffle!(y.0, x.0, [0, 5, 2, 7]), shuffle!(y.1, x.1, [4, 1, 6, 3])) } Lanes::AB => { (y.0, x.1) } Lanes::AC => { - (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits(), - vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits()) + (shuffle!(y.0, x.0, [0, 5, 2, 7]), shuffle!(y.1, x.1, [0, 5, 2, 7])) } Lanes::CD => { - (x.0, y.1) + (x.0, y.1) } Lanes::BC => { - (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits() ).into_bits(), - vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits() ).into_bits()) + (shuffle!(y.0, x.0, [4, 1, 6, 3]), shuffle!(y.1, x.1, [0, 5, 2, 7])) } Lanes::ABCD => { y @@ -333,7 +328,7 @@ impl FieldElement2625x4 { use core::arch::aarch64::vget_high_u32; use core::arch::aarch64::vcombine_u32; - let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(), + let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(), vqshlq_u32(v.1.into_bits(), shifts.1.into_bits()).into_bits()); (vcombine_u32(vget_high_u32(c.0.into_bits()), vget_low_u32(c.0.into_bits())).into_bits(), vcombine_u32(vget_high_u32(c.1.into_bits()), vget_low_u32(c.1.into_bits())).into_bits()) @@ -377,7 +372,7 @@ impl FieldElement2625x4 { use core::arch::aarch64::vmulq_n_u32; use core::arch::aarch64::vget_low_u32; use core::arch::aarch64::vcombine_u32; - + let c9_19_spread: (u32x4, u32x4) = (vmulq_n_u32(c98.0.into_bits(), 19).into_bits(), vmulq_n_u32(c98.1.into_bits(), 19).into_bits()); (vcombine_u32(vget_low_u32(c9_19_spread.0.into_bits()), u32x2::splat(0).into_bits()).into_bits(), @@ -423,9 +418,9 @@ impl FieldElement2625x4 { unsafe { use core::arch::aarch64::vmulq_n_u32; - c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(), + c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(), vmulq_n_u32(c0.1.into_bits(), 19).into_bits()); - c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(), + c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(), vmulq_n_u32(c1.1.into_bits(), 19).into_bits()); } @@ -457,8 +452,8 @@ impl FieldElement2625x4 { #[inline(always)] fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) { use core::arch::aarch64::vmull_u32; - unsafe { - let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), + unsafe { + let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits()); (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2))) } @@ -497,7 +492,7 @@ impl FieldElement2625x4 { let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1); let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1); let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5); - + let low__p37 = u64x4::splat(0x3ffffed << 37); let even_p37 = u64x4::splat(0x3ffffff << 37); @@ -609,8 +604,8 @@ impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 { #[inline(always)] fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) { use core::arch::aarch64::vmull_u32; - unsafe { - let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), + unsafe { + let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(), vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits()); (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2))) }