Skip to content

Commit

Permalink
Merge branch 'main' into feature/interleaved-complex-nums
Browse files Browse the repository at this point in the history
  • Loading branch information
smu160 authored Jul 14, 2024
2 parents f639900 + 20fafba commit 5c09b89
Show file tree
Hide file tree
Showing 11 changed files with 420 additions and 142 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ default = []
complex-nums = ["dep:num-complex", "dep:bytemuck"]

[dev-dependencies]
utilities = { path = "utilities" }
fftw = "0.8.0"
criterion = "0.5.1"
fftw = "0.8.0"
rand = "0.8.5"
utilities = { path = "utilities" }

[[bench]]
name = "bench"
Expand Down
220 changes: 144 additions & 76 deletions benches/bench.rs
Original file line number Diff line number Diff line change
@@ -1,91 +1,159 @@
#![feature(portable_simd, avx512_target_feature)]

use std::simd::{f32x16, f64x8};

use bytemuck::cast_slice;
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use num_complex::Complex;

use phastft::separate_re_im_f64;

macro_rules! impl_separate_re_im_with_capacity {
($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => {
/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
/// into separate vectors for the corresponding real and imaginary components.
#[multiversion::multiversion(
targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
pub fn $func_name(
signal: &[Complex<$precision>],
) -> (Vec<$precision>, Vec<$precision>) {
let n = signal.len();
let mut reals = Vec::with_capacity(n);
let mut imags = Vec::with_capacity(n);

let complex_t: &[$precision] = cast_slice(signal);
const CHUNK_SIZE: usize = $lanes * 2;

for chunk in complex_t
.chunks_exact(CHUNK_SIZE)
{
let (first_half, second_half) = chunk.split_at($lanes);

let a = <$simd_vec>::from_slice(&first_half);
let b = <$simd_vec>::from_slice(&second_half);
let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b);

reals.extend_from_slice(&re_deinterleaved.to_array());
imags.extend_from_slice(&im_deinterleaved.to_array());
}

let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
if !remainder.is_empty() {
for chunk in remainder.chunks_exact(2) {
reals.push(chunk[0]);
imags.push(chunk[1]);
}
}

(reals, imags)
}
};
use num_traits::Float;
use phastft::{
fft_32_with_opts_and_plan, fft_64_with_opts_and_plan,
options::Options,
planner::{Direction, Planner32, Planner64},
};
use rand::{
distributions::{Distribution, Standard},
thread_rng, Rng,
};
use utilities::rustfft::num_complex::Complex;
use utilities::rustfft::FftPlanner;

const LENGTHS: &[usize] = &[
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
];

fn generate_numbers<T: Float>(n: usize) -> (Vec<T>, Vec<T>)
where
Standard: Distribution<T>,
{
let mut rng = thread_rng();

let samples: Vec<T> = (&mut rng).sample_iter(Standard).take(2 * n).collect();

let mut reals = vec![T::zero(); n];
let mut imags = vec![T::zero(); n];

for ((z_re, z_im), rand_chunk) in reals
.iter_mut()
.zip(imags.iter_mut())
.zip(samples.chunks_exact(2))
{
*z_re = rand_chunk[0];
*z_im = rand_chunk[1];
}

(reals, imags)
}

#[cfg(feature = "complex-nums")]
impl_separate_re_im_with_capacity!(separate_re_im_f32_with_cap, f32, 16, f32x16);
fn generate_complex_numbers<T: Float + Default>(n: usize) -> Vec<Complex<T>>
where
Standard: Distribution<T>,
{
let mut rng = thread_rng();

let samples: Vec<T> = (&mut rng).sample_iter(Standard).take(2 * n).collect();

#[cfg(feature = "complex-nums")]
impl_separate_re_im_with_capacity!(separate_re_im_f64_with_cap, f64, 8, f64x8);
let mut signal = vec![Complex::default(); n];

fn criterion_benchmark(c: &mut Criterion) {
let sizes = vec![1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18, 1 << 20];
for (z, rand_chunk) in signal.iter_mut().zip(samples.chunks_exact(2)) {
z.re = rand_chunk[0];
z.im = rand_chunk[1];
}

let mut group = c.benchmark_group("separate_re_im");
for size in sizes.into_iter() {
let signal: Vec<_> = (0..size)
.map(|x| Complex::new(x as f64, (x * 2) as f64))
.collect();
signal
}

group.throughput(Throughput::Elements(size));
fn benchmark_forward_f32(c: &mut Criterion) {
let mut group = c.benchmark_group("Forward f32");

for n in LENGTHS.iter() {
let len = 1 << n;
group.throughput(Throughput::Elements(len as u64));

let id = "PhastFT FFT Forward";
let options = Options::guess_options(len);
let planner = Planner32::new(len, Direction::Forward);
let (mut reals, mut imags) = generate_numbers(len);

group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
b.iter(|| {
fft_32_with_opts_and_plan(
black_box(&mut reals),
black_box(&mut imags),
black_box(&options),
black_box(&planner),
);
});
});

group.bench_with_input(
BenchmarkId::new("with_zeroed_vecs", size),
&signal,
|b, s| b.iter(|| separate_re_im_f64(black_box(s))),
);
let id = "RustFFT FFT Forward";
let mut planner = FftPlanner::<f32>::new();
let fft = planner.plan_fft_forward(len);
let mut signal = generate_complex_numbers(len);

group.bench_with_input(BenchmarkId::new("with_capacity", size), &signal, |b, s| {
b.iter(|| separate_re_im_f64_with_cap(black_box(s)));
group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
b.iter(|| fft.process(black_box(&mut signal)));
});
}
group.finish();
}

criterion_group!(benches, criterion_benchmark);
fn benchmark_inverse_f32(c: &mut Criterion) {
let options = Options::default();

for n in LENGTHS.iter() {
let len = 1 << n;
let id = format!("FFT Inverse f32 {} elements", len);
let planner = Planner32::new(len, Direction::Reverse);

c.bench_function(&id, |b| {
let (mut reals, mut imags) = generate_numbers(len);
b.iter(|| {
black_box(fft_32_with_opts_and_plan(
&mut reals, &mut imags, &options, &planner,
));
});
});
}
}

fn benchmark_forward_f64(c: &mut Criterion) {
let options = Options::default();

for n in LENGTHS.iter() {
let len = 1 << n;
let id = format!("FFT Forward f64 {} elements", len);
let planner = Planner64::new(len, Direction::Forward);

c.bench_function(&id, |b| {
let (mut reals, mut imags) = generate_numbers(len);
b.iter(|| {
black_box(fft_64_with_opts_and_plan(
&mut reals, &mut imags, &options, &planner,
));
});
});
}
}

fn benchmark_inverse_f64(c: &mut Criterion) {
let options = Options::default();

for n in LENGTHS.iter() {
let len = 1 << n;
let id = format!("FFT Inverse f64 {} elements", len);
let planner = Planner64::new(len, Direction::Reverse);

c.bench_function(&id, |b| {
let (mut reals, mut imags) = generate_numbers(len);
b.iter(|| {
black_box(fft_64_with_opts_and_plan(
&mut reals, &mut imags, &options, &planner,
));
});
});
}
}

criterion_group!(
benches,
benchmark_forward_f32,
benchmark_inverse_f32,
benchmark_forward_f64,
benchmark_inverse_f64
);
criterion_main!(benches);
10 changes: 7 additions & 3 deletions examples/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@ use std::str::FromStr;

use utilities::gen_random_signal;

use phastft::fft_64;
use phastft::planner::Direction;
use phastft::fft_64_with_opts_and_plan;
use phastft::options::Options;
use phastft::planner::{Direction, Planner64};

fn benchmark_fft_64(n: usize) {
let big_n = 1 << n;
let mut reals = vec![0.0; big_n];
let mut imags = vec![0.0; big_n];
gen_random_signal(&mut reals, &mut imags);

let planner = Planner64::new(reals.len(), Direction::Forward);
let opts = Options::guess_options(reals.len());

let now = std::time::Instant::now();
fft_64(&mut reals, &mut imags, Direction::Forward);
fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
let elapsed = now.elapsed().as_micros();
println!("{elapsed}");
}
Expand Down
27 changes: 18 additions & 9 deletions examples/profile.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,29 @@
use std::env;
use std::str::FromStr;

use phastft::fft_64;
use phastft::planner::Direction;

fn benchmark_fft(num_qubits: usize) {
let n = 1 << num_qubits;
let mut reals: Vec<f64> = (1..=n).map(|i| i as f64).collect();
let mut imags: Vec<f64> = (1..=n).map(|i| i as f64).collect();
fft_64(&mut reals, &mut imags, Direction::Forward);
use utilities::gen_random_signal;

use phastft::fft_64_with_opts_and_plan;
use phastft::options::Options;
use phastft::planner::{Direction, Planner64};

fn benchmark_fft_64(n: usize) {
let big_n = 1 << n;
let mut reals = vec![0.0; big_n];
let mut imags = vec![0.0; big_n];
gen_random_signal(&mut reals, &mut imags);

let planner = Planner64::new(reals.len(), Direction::Forward);
let opts = Options::guess_options(reals.len());

fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
}

fn main() {
let args: Vec<String> = env::args().collect();
assert_eq!(args.len(), 2, "Usage {} <n>", args[0]);

let n = usize::from_str(&args[1]).unwrap();
benchmark_fft(n);

benchmark_fft_64(n);
}
7 changes: 4 additions & 3 deletions examples/rustfft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use utilities::{
fn benchmark_rustfft(n: usize) {
let big_n = 1 << n;

let mut reals = vec![0.0; big_n];
let mut imags = vec![0.0; big_n];
let mut reals = vec![0.0f64; big_n];
let mut imags = vec![0.0f64; big_n];

gen_random_signal(&mut reals, &mut imags);
let mut signal = vec![Complex64::default(); big_n];
Expand All @@ -23,9 +23,10 @@ fn benchmark_rustfft(n: usize) {
z.im = im;
});

let now = std::time::Instant::now();
let mut planner = FftPlanner::new();
let fft = planner.plan_fft_forward(signal.len());

let now = std::time::Instant::now();
fft.process(&mut signal);
let elapsed = now.elapsed().as_micros();
println!("{elapsed}");
Expand Down
32 changes: 32 additions & 0 deletions src/kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ use num_traits::Float;

macro_rules! fft_butterfly_n_simd {
($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => {
#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
#[inline]
pub fn $func_name(
reals: &mut [$precision],
Expand Down Expand Up @@ -52,6 +60,14 @@ macro_rules! fft_butterfly_n_simd {
fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8);
fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16);

#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
#[inline]
pub(crate) fn fft_chunk_n<T: Float>(
reals: &mut [T],
Expand Down Expand Up @@ -93,6 +109,14 @@ pub(crate) fn fft_chunk_n<T: Float>(
}

/// `chunk_size == 4`, so hard code twiddle factors
#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
#[inline]
pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
let dist = 2;
Expand Down Expand Up @@ -128,6 +152,14 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
}

/// `chunk_size == 2`, so skip phase
#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
#[inline]
pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
reals
Expand Down
Loading

0 comments on commit 5c09b89

Please sign in to comment.